From 2cecf1a1201bf49b5bca931429c1a2dfe3694600 Mon Sep 17 00:00:00 2001
From: Jacob Moore
Date: Wed, 22 Oct 2025 14:02:26 -0500
Subject: [PATCH 01/52] ENH: Adding example for mesh decomposition WIP

---
 examples/mesh_decomp/CMakeLists.txt      |   13 +
 examples/mesh_decomp/install_ptscotch.sh |   31 +
 examples/mesh_decomp/mesh.h              | 1481 +++++++
 examples/mesh_decomp/mesh_decomp.cpp     |   32 +
 examples/mesh_decomp/mesh_io.h           | 4894 ++++++++++++++++++++++
 5 files changed, 6451 insertions(+)
 create mode 100644 examples/mesh_decomp/CMakeLists.txt
 create mode 100755 examples/mesh_decomp/install_ptscotch.sh
 create mode 100644 examples/mesh_decomp/mesh.h
 create mode 100644 examples/mesh_decomp/mesh_decomp.cpp
 create mode 100644 examples/mesh_decomp/mesh_io.h

diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt
new file mode 100644
index 00000000..721859a8
--- /dev/null
+++ b/examples/mesh_decomp/CMakeLists.txt
@@ -0,0 +1,13 @@
+cmake_minimum_required(VERSION 3.1.3)
+
+find_package(Matar REQUIRED)
+
+if (KOKKOS)
+  #find_package(Kokkos REQUIRED) #new
+
+  add_executable(mesh_decomp mesh_decomp.cpp)
+
+  add_definitions(-DHAVE_KOKKOS=1)
+
+  target_link_libraries(mesh_decomp ${LINKING_LIBRARIES})
+endif()
diff --git a/examples/mesh_decomp/install_ptscotch.sh b/examples/mesh_decomp/install_ptscotch.sh
new file mode 100755
index 00000000..95ad7914
--- /dev/null
+++ b/examples/mesh_decomp/install_ptscotch.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Install script for Scotch and PT-Scotch
+set -e
+
+# Configuration
+LIB_DIR="lib"
+# SCOTCH_VERSION="7.0.4"
+# PTSCOTCH_VERSION="7.0.4"
+# INSTALL_PREFIX="$(pwd)/${LIB_DIR}"
+
+# echo "Installing Scotch and PT-Scotch to ${INSTALL_PREFIX}"
+
+# Create lib directory
+mkdir -p "${LIB_DIR}"
+cd ${LIB_DIR}
+# Clone and build Scotch
+echo "Cloning Scotch..."
+if [ -d "scotch" ]; then
+    rm -rf scotch
+fi
+git clone https://gitlab.inria.fr/scotch/scotch.git
+cd scotch
+
+echo "Building Scotch..."
+mkdir build
+cd build
+cmake ..
+make + +echo "Installation complete! Libraries installed in: ${INSTALL_PREFIX}" \ No newline at end of file diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h new file mode 100644 index 00000000..599cb77d --- /dev/null +++ b/examples/mesh_decomp/mesh.h @@ -0,0 +1,1481 @@ +/********************************************************************************************** +� 2020. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos +National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All rights in the program are +reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear +Security Administration. The Government is granted for itself and others acting on its behalf a +nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare +derivative works, distribute copies to the public, perform publicly and display publicly, and +to permit others to do so. +This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used +to endorse or promote products derived from this software without specific prior +written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************************/ +#ifndef MESH_H +#define MESH_H + +#include "matar.h" +#include "state.h" +#include "ref_elem.h" +#include + +#define PI 3.141592653589793 + +using namespace mtr; + +namespace mesh_init +{ +// element mesh types +enum elem_name_tag +{ + linear_simplex_element = 0, + linear_tensor_element = 1, + arbitrary_tensor_element = 2 +}; + +// other enums could go here on the mesh +} // end namespace + + +/* +========================== +Nodal indexing convention +========================== + + K + ^ J + | / + | / + | / + 6------------------7 + /| /| + / | / | + / | / | + / | / | + / | / | +4------------------5 | +| | | | ----> I +| | | | +| | | | +| | | | +| 2------------|-----3 +| / | / +| / | / +| / | / +| / | / +|/ |/ +0------------------1 + +nodes are ordered for outward normal +patch 0: [0,4,6,2] xi-minus dir +patch 1: [1,3,7,5] xi-plus dir +patch 2: [0,1,5,4] eta-minus dir +patch 3: [3,2,6,7] eta-plus dir +patch 4: [0,2,3,1] zeta-minus dir +patch 6: [4,5,7,6] zeta-plus dir +*/ + +// sort in ascending order using bubble sort +KOKKOS_INLINE_FUNCTION +void bubble_sort(size_t arr[], const size_t num) +{ + for (size_t i = 0; i 
< (num - 1); i++) { + for (size_t j = 0; j < (num - i - 1); j++) { + if (arr[j] > arr[j + 1]) { + size_t temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } // end if + } // end for j + } // end for i +} // end function + +struct zones_in_elem_t +{ + private: + size_t num_zones_in_elem_; + public: + zones_in_elem_t() { + }; + + zones_in_elem_t(const size_t num_zones_in_elem_inp) { + this->num_zones_in_elem_ = num_zones_in_elem_inp; + }; + + // return global zone index for given local zone index in an element + size_t host(const size_t elem_gid, const size_t zone_lid) const + { + return elem_gid * num_zones_in_elem_ + zone_lid; + }; + + // Return the global zone ID given an element gloabl ID and a local zone ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t elem_gid, const size_t zone_lid) const + { + return elem_gid * num_zones_in_elem_ + zone_lid; + }; +}; + +// if material points are defined strictly internal to the element. +struct gauss_in_elem_t +{ + private: + size_t num_gauss_in_elem_; + public: + gauss_in_elem_t() { + }; + + gauss_in_elem_t(const size_t num_gauss_in_elem_inp) { + this->num_gauss_in_elem_ = num_gauss_in_elem_inp; + }; + + // return global gauss index for given local gauss index in an element + size_t host(const size_t elem_gid, const size_t leg_gauss_lid) const + { + return elem_gid * num_gauss_in_elem_ + leg_gauss_lid; + }; + + // Return the global gauss ID given an element gloabl ID and a local gauss ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t elem_gid, const size_t leg_gauss_lid) const + { + return elem_gid * num_gauss_in_elem_ + leg_gauss_lid; + }; +}; + +/// if material points are defined at element interfaces +struct lobatto_in_elem_t +{ + private: + size_t num_lobatto_in_elem_; + public: + lobatto_in_elem_t() { + }; + + lobatto_in_elem_t(const size_t num_lobatto_in_elem_inp) { + this->num_lobatto_in_elem_ = num_lobatto_in_elem_inp; + }; + + // return global gauss index for given local gauss index 
in an element + size_t host(const size_t elem_gid, const size_t lob_gauss_lid) const + { + return elem_gid * num_lobatto_in_elem_ + lob_gauss_lid; + }; + + // Return the global gauss ID given an element gloabl ID and a local gauss ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t elem_gid, const size_t lob_gauss_lid) const + { + return elem_gid * num_lobatto_in_elem_ + lob_gauss_lid; + }; +}; + +// struct nodes_in_zone_t { +// private: +// size_t num_nodes_in_zone_; +// public: +// nodes_in_zone_t(){}; + +// nodes_in_zone_t(const size_t num_nodes_in_zone_inp){ +// this->num_nodes_in_zone_ = num_nodes_in_zone_inp; +// }; + +// // return global zone index for given local zone index in an element +// size_t host(const size_t zone_gid, const size_t node_lid) const{ +// return zone_gid*num_nodes_in_zone_ + node_lid; +// }; + +// KOKKOS_INLINE_FUNCTION +// size_t operator()(const size_t zone_gid, const size_t node_lid) const{ +// return zone_gid*num_nodes_in_zone_ + node_lid; +// }; +// }; + +// mesh sizes and connectivity data structures +struct Mesh_t +{ + // ******* Entity Definitions **********// + // Element: A hexahedral volume + // Zone: A discretization of an element base on subdividing the element using the nodes + // Node: A kinematic degree of freedom + // Surface: The 2D surface of the element + // Patch: A discretization of a surface by subdividing the surface using the nodes + // Corner: A element-node pair + + // ---- Global Mesh Definitions ---- // + mesh_init::elem_name_tag elem_kind = mesh_init::linear_tensor_element; ///< The type of elements used in the mesh + + size_t Pn = 1; ///< Polynomial order of kinematic space + size_t num_dims = 3; ///< Number of spatial dimension + + // ---- Element Data Definitions ---- // + size_t num_elems; ///< Number of elements in the mesh + size_t num_nodes_in_elem; ///< Number of nodes in an element + size_t num_patches_in_elem; ///< Number of patches in an element + size_t num_surfs_in_elem; ///< Number of 
surfaces in an element + size_t num_zones_in_elem; ///< Number of zones in an element + + size_t num_gauss_in_elem; ///< Number of Gauss points in an element + size_t num_lobatto_in_elem; ///< Number of Gauss Lobatto points in an element + + DCArrayKokkos nodes_in_elem; ///< Nodes in an element + CArrayKokkos corners_in_elem; ///< Corners in an element -- this can just be a functor + + RaggedRightArrayKokkos elems_in_elem; ///< Elements connected to an element + CArrayKokkos num_elems_in_elem; ///< Number of elements connected to an element + + CArrayKokkos patches_in_elem; ///< Patches in an element (including internal patches) + CArrayKokkos surfs_in_elem; ///< Surfaces on an element + + // CArrayKokkos zones_in_elem; ///< Zones in an element + zones_in_elem_t zones_in_elem; ///< Zones in an element + lobatto_in_elem_t lobatto_in_elem; ///< Gauss Lobatto points in an element + gauss_in_elem_t gauss_in_elem; ///< Gauss points in an element + + // ---- Node Data Definitions ---- // + size_t num_nodes; ///< Number of nodes in the mesh + + RaggedRightArrayKokkos corners_in_node; ///< Corners connected to a node + CArrayKokkos num_corners_in_node; ///< Number of corners connected to a node + RaggedRightArrayKokkos elems_in_node; ///< Elements connected to a given node + RaggedRightArrayKokkos nodes_in_node; ///< Nodes connected to a node along an edge + CArrayKokkos num_nodes_in_node; ///< Number of nodes connected to a node along an edge + + // ---- Surface Data Definitions ---- // + size_t num_surfs; ///< Number of surfaces in the mesh + size_t num_nodes_in_surf; ///< Number of nodes in a surface + size_t num_patches_in_surf; ///< Number of patches in a surface + + CArrayKokkos patches_in_surf; ///< Patches in a surface + CArrayKokkos nodes_in_surf; ///< Nodes connected to a surface + CArrayKokkos elems_in_surf; ///< Elements connected to a surface + + // ---- Patch Data Definitions ---- // + size_t num_patches; ///< Number of patches in the mesh + size_t 
num_nodes_in_patch; ///< Number of nodes in a patch + // size_t num_lobatto_in_patch; ///< Number of Gauss Lobatto nodes in a patch + // size_t num_gauss_in_patch; ///< Number of Gauss nodes in a patch + + CArrayKokkos nodes_in_patch; ///< Nodes connected to a patch + CArrayKokkos elems_in_patch; ///< Elements connected to a patch + CArrayKokkos surf_in_patch; ///< Surfaces connected to a patch (co-planar) + + // ---- Corner Data Definitions ---- // + size_t num_corners; ///< Number of corners (define) in the mesh + + // ---- Zone Data Definitions ---- // + size_t num_zones; ///< Number of zones in the mesh + size_t num_nodes_in_zone; ///< Number of nodes in a zone + + CArrayKokkos nodes_in_zone; ///< Nodes defining a zone + // nodes_in_zone_t nodes_in_zone; + + // ---- Boundary Data Definitions ---- // + size_t num_bdy_sets; ///< Number of boundary sets + size_t num_bdy_nodes; ///< Number of boundary nodes + size_t num_bdy_patches; ///< Number of boundary patches + + CArrayKokkos bdy_patches; ///< Boundary patches + CArrayKokkos bdy_nodes; ///< Boundary nodes + + RaggedRightArrayKokkos bdy_patches_in_set; ///< Boundary patches in a boundary set + DCArrayKokkos num_bdy_patches_in_set; ///< Number of boundary nodes in a set + + RaggedRightArrayKokkos bdy_nodes_in_set; ///< Boundary nodes in a boundary set + DCArrayKokkos num_bdy_nodes_in_set; ///< Number of boundary nodes in a set + + // initialization methods + void initialize_nodes(const size_t num_nodes_inp) + { + num_nodes = num_nodes_inp; + + return; + }; // end method + + // initialization methods + void initialize_elems(const size_t num_elems_inp, const size_t num_dims_inp) + { + num_dims = num_dims_inp; + num_nodes_in_elem = 1; + + for (int dim = 0; dim < num_dims; dim++) { + num_nodes_in_elem *= 2; + } + num_elems = num_elems_inp; + nodes_in_elem = DCArrayKokkos(num_elems, num_nodes_in_elem, "mesh.nodes_in_elem"); + corners_in_elem = CArrayKokkos(num_elems, num_nodes_in_elem, "mesh.corners_in_elem"); + + // 
1 Gauss point per element + num_gauss_in_elem = 1; + + // 1 zone per element + num_zones_in_elem = 1; + + gauss_in_elem = gauss_in_elem_t(num_gauss_in_elem); + + return; + }; // end method + + // initialization method + void initialize_elems_Pn(const size_t num_elems_inp, + const size_t num_nodes_in_elem_inp, + const size_t num_gauss_leg_in_elem_inp, + const size_t num_zones_in_elem_inp, + const size_t num_nodes_in_zone_inp, + const size_t num_surfs_in_elem_inp, + const size_t num_dims_inp) + { + num_dims = num_dims_inp; + num_elems = num_elems_inp; + + num_nodes_in_elem = num_nodes_in_elem_inp; + num_nodes_in_zone = num_nodes_in_zone_inp; + num_gauss_in_elem = num_gauss_leg_in_elem_inp; + num_zones_in_elem = num_zones_in_elem_inp; + num_surfs_in_elem = num_surfs_in_elem_inp; + + num_zones = num_zones_in_elem * num_elems; + + nodes_in_elem = DCArrayKokkos(num_elems, num_nodes_in_elem, "mesh.nodes_in_elem"); + corners_in_elem = CArrayKokkos(num_elems, num_nodes_in_elem, "mesh.corners_in_elem"); + zones_in_elem = zones_in_elem_t(num_zones_in_elem); + surfs_in_elem = CArrayKokkos(num_elems, num_surfs_in_elem, "mesh.surfs_in_zone"); + nodes_in_zone = CArrayKokkos(num_zones, num_nodes_in_zone, "mesh.nodes_in_zone"); + gauss_in_elem = gauss_in_elem_t(num_gauss_in_elem); + + return; + }; // end method + + // initialization methods + void initialize_corners(const size_t num_corners_inp) + { + num_corners = num_corners_inp; + + return; + }; // end method + + // build the corner mesh connectivity arrays + void build_corner_connectivity() + { + num_corners_in_node = CArrayKokkos(num_nodes, "mesh.num_corners_in_node"); // stride sizes + + // initializing the number of corners (node-cell pair) to be zero + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + num_corners_in_node(node_gid) = 0; + }); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(node_lid, 0, num_nodes_in_elem, { + // get the global_id of the node + size_t node_gid = 
nodes_in_elem(elem_gid, node_lid); + + // increment the number of corners attached to this point + num_corners_in_node(node_gid) = num_corners_in_node(node_gid) + 1; + }); // end FOR_ALL over nodes in element + } // end for elem_gid + + // the stride sizes are the num_corners_in_node at the node + corners_in_node = RaggedRightArrayKokkos(num_corners_in_node, "mesh.corners_in_node"); + + CArrayKokkos count_saved_corners_in_node(num_nodes, "count_saved_corners_in_node"); + + // reset num_corners to zero + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + count_saved_corners_in_node(node_gid) = 0; + }); + + // the elems_in_elem data type + elems_in_node = RaggedRightArrayKokkos(num_corners_in_node, "mesh.elems_in_node"); + + // populate the elements connected to a node list and corners in a node + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(node_lid, 0, num_nodes_in_elem, { + // get the global_id of the node + size_t node_gid = nodes_in_elem(elem_gid, node_lid); + + // the column index is the num corners saved + size_t j = count_saved_corners_in_node(node_gid); + + // Save corner index to this node_gid + size_t corner_gid = node_lid + elem_gid * num_nodes_in_elem; // this can be a functor + corners_in_node(node_gid, j) = corner_gid; + + elems_in_node(node_gid, j) = elem_gid; // save the elem_gid + + // Save corner index to element + size_t corner_lid = node_lid; + corners_in_elem(elem_gid, corner_lid) = corner_gid; + + // increment the number of corners saved to this node_gid + count_saved_corners_in_node(node_gid) = count_saved_corners_in_node(node_gid) + 1; + }); // end FOR_ALL over nodes in element + } // end for elem_gid + + return; + } // end of build_corner_connectivity + + // build elem connectivity arrays + void build_elem_elem_connectivity() + { + // find the max number of elems around a node + size_t max_num_elems_in_node; + size_t max_num_lcl; + FOR_REDUCE_MAX_CLASS(node_gid, 0, num_nodes, max_num_lcl, { + // num_corners_in_node = 
num_elems_in_node + size_t max_num = num_corners_in_node(node_gid); + + if (max_num > max_num_lcl) { + max_num_lcl = max_num; + } + }, max_num_elems_in_node); // end parallel reduction on max + Kokkos::fence(); + + // a temporary ragged array to save the elems around an elem + DynamicRaggedRightArrayKokkos temp_elems_in_elem(num_nodes, num_nodes_in_elem * max_num_elems_in_node, "temp_elems_in_elem"); + + num_elems_in_elem = CArrayKokkos(num_elems, "mesh.num_elems_in_elem"); + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + num_elems_in_elem(elem_gid) = 0; + }); + Kokkos::fence(); + + // find and save neighboring elem_gids of an elem + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { + // get the gid for the node + size_t node_id = nodes_in_elem(elem_gid, node_lid); + + // loop over all elems connected to node_gid + for (int elem_lid = 0; elem_lid < num_corners_in_node(node_id); elem_lid++) { + // get the global id for the neighboring elem + size_t neighbor_elem_gid = elems_in_node(node_id, elem_lid); + + // a flag to save (=1) or not (=0) + size_t save = 1; + + // a true neighbor_elem_id is not equal to elem_gid + if (neighbor_elem_gid == elem_gid) { + save = 0; // don't save + } // end if + + // check to see if the neighbor_elem_gid has been saved already + size_t num_saved = temp_elems_in_elem.stride(elem_gid); + for (size_t i = 0; i < num_saved; i++) { + if (neighbor_elem_gid == temp_elems_in_elem(elem_gid, i)) { + save = 0; // don't save, it has been saved already + } // end if + } // end for i + + if (save == 1) { + // increment the number of neighboring elements saved + temp_elems_in_elem.stride(elem_gid)++; + + // save the neighboring elem_gid + temp_elems_in_elem(elem_gid, num_saved) = neighbor_elem_gid; + } // end if save + } // end for elem_lid in a node + } // end for node_lid in an elem + + // save the actial stride size + num_elems_in_elem(elem_gid) = temp_elems_in_elem.stride(elem_gid); + }); // 
end FOR_ALL elems + Kokkos::fence(); + + // compress out the extra space in the temp_elems_in_elem + elems_in_elem = RaggedRightArrayKokkos(num_elems_in_elem, "mesh.elems_in_elem"); + + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t i = 0; i < num_elems_in_elem(elem_gid); i++) { + elems_in_elem(elem_gid, i) = temp_elems_in_elem(elem_gid, i); + } // end for i + }); // end FOR_ALL elems + Kokkos::fence(); + + return; + } // end of build_elem_elem_connectivity + + // build the patches + void build_patch_connectivity() + { + // WARNING WARNING + // the mesh element kind should be in the input file and set when reading mesh + // mesh_elem_kind = mesh_init::linear_tensor_element; // MUST BE SET + + // building patches + + num_nodes_in_patch = 2 * (num_dims - 1); // 2 (2D) or 4 (3D) + num_surfs_in_elem = 2 * num_dims; // 4 (2D) or 6 (3D) + + // num_lobatto_in_patch = int(pow(3, num_dims-1)); + + // num_gauss_in_patch = 2*(num_dims-1); + + size_t num_patches_in_surf; // = Pn_order or = Pn_order*Pn_order + + size_t num_1D = Pn + 1; // number of nodes in 1D + + // num quad points 1D // + // size_t num_lob_1D = 2*Pn + 1; + // size_t num_1D = 2*Pn; + + DCArrayKokkos node_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_nodes_in_patch); + + // DCArrayKokkos lobatto_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_lobatto_in_patch); + + // DCArrayKokkos gauss_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_gauss_in_patch); + + printf("Number of dimensions = %zu \n", num_dims); + + if (num_dims == 3) { + // num_patches_in_surf = [1^2, 2^2, 3^2, 4^2, ... 
, Pn^2] + + num_patches_in_surf = Pn * Pn; + + num_patches_in_elem = num_patches_in_surf * num_surfs_in_elem; + + // nodes in a patch in the element + node_ordering_in_elem = DCArrayKokkos(num_patches_in_elem, num_nodes_in_patch, "node_ordering_in_elem"); + + // lobatto_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_lobatto_in_patch); + + // gauss_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_gauss_in_patch); + + // printf("num_patches_in_elem = %zu \n", num_patches_in_elem); + // printf("num_nodes_in_patch = %zu \n", num_nodes_in_patch); + // printf("num_lobatto_in_patch = %zu \n", num_lobatto_in_patch); + // printf("num_gauss_in_patch = %zu \n", num_gauss_in_patch); + // printf("Number of surfaces = %zu \n", num_surfs_in_elem); + } + else { + num_patches_in_surf = Pn; + + num_patches_in_elem = num_patches_in_surf * num_surfs_in_elem; + + // nodes in a patch in the element + node_ordering_in_elem = DCArrayKokkos(num_patches_in_elem, num_nodes_in_patch, "node_ordering_in_elem"); + // lobatto_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_lobatto_in_patch); + // gauss_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_gauss_in_patch); + } // end if dim + + // On the CPU, set the node order for the patches in an element + // classic linear elements + if (elem_kind == mesh_init::linear_tensor_element) { + if (num_dims == 3) { + + size_t temp_node_lids[24] = { 0, 4, 6, 2, + 1, 3, 7, 5, + 0, 1, 5, 4, + 3, 2, 6, 7, + 0, 2, 3, 1, + 4, 5, 7, 6 }; + + int count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + // gauss_ordering_in_elem.host( elem_patch_lid, node_lid ) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } 
// end for patch_lid in a surface + } // end for i + + // count = 0; + // elem_patch_lid = 0; + // for ( size_t surf_lid=0; surf_lid < num_surfs_in_elem; surf_lid++ ){ + // for ( size_t patch_lid=0; patch_lid < num_patches_in_surf; patch_lid++ ){ + // for ( size_t lobatto_lid=0; lobatto_lid < num_lobatto_in_patch; lobatto_lid++ ){ + // lobatto_ordering_in_elem.host( elem_patch_lid, lobatto_lid ) = temp_node_lids[count]; + // count++; + // } // end for node_lid + // elem_patch_lid ++; + // } // end for patch_lid in a surface + // } // end for i + } + else { + // J + // | + // 3---2 + // | | -- I + // 0---1 + // + size_t temp_node_lids[8] = + { 0, 3, + 1, 2, + 0, 1, + 3, 2 }; + + int count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + // gauss_ordering_in_elem.host( elem_patch_lid, node_lid ) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end if on dims + } // end of linear element iwth classic numbering + // ----- + // arbitrary-order element + // ----- + else if (elem_kind == mesh_init::arbitrary_tensor_element) { + size_t temp_node_lids[num_nodes_in_patch * num_patches_in_surf * num_surfs_in_elem]; + + printf("arbitrary order tensor element \n"); + + // arbitrary-order node ordering in patches of an element + if (num_dims == 3) { + /* + + i,j,k layout + + k j + | / + |/ + o-->i + + + i=0,imax + o (j+1,k+1) + /| + (j,k+1) o o (j+1,k) + |/ + (j,k) o + + */ + + int count = 0; + + int i_patch, j_patch, k_patch; + + // i-minus-dir patches + + i_patch = 0; + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + // node_lid 0 in patch + // index = i + j*num_1D + 
k*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j, k, num_1D); + count++; + + // node_lid 1 in patch + // index = i + j*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j, k+1, num_1D); + count++; + + // node_lid 2 in patch + // index = i + (j+1)*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j+1, k+1, num_1D); + count++; + + // node_lid 3 in patch + // index = i + (j+1)*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j+1, k, num_1D); + count++; + } // end for k + } // end for j + + // printf("i-minus\n"); + + // i-plus-dir patches + i_patch = num_1D - 1; + // printf("num_1D = %zu \n", num_1D); + // printf("i_patch = %d \n", i_patch); + printf("num_nodes_in_elem %zu \n", num_nodes_in_elem); + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + // node_lid 0 in patch + // index = i + j*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j, k, num_1D); + count++; + + // node_lid 1 in patch + // index = i + (j+1)*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j+1, k, num_1D); + count++; + + // node_lid 2 in patch + // index = i + (j+1)*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j+1, k+1, num_1D); + count++; + + // node_lid 3 in patch + // index = i + j*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j, k+1, num_1D); + count++; + } // end for j + } // end for k + + // printf("i-plus\n"); + + /* + + i,j,k layout + + k j + | / + |/ + o-->i + + 
+ j=0,jmax + + (i,,k+1) o--o (i+1,,k+1) + | | + (i,,k) o--o (i+1,,k) + + */ + + j_patch = 0; + for (int k = 0; k < num_1D - 1; k++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i, j_patch, k, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i+1, j_patch, k, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i+1, j_patch, k+1, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i, j_patch, k+1, num_1D); + count++; + } // end for i + } // end for k + + // printf("j-minus\n"); + + j_patch = num_1D - 1; + for (int k = 0; k < num_1D - 1; k++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i, j_patch, k, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i, j_patch, k+1, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i+1, j_patch, k+1, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i+1, j_patch, k, num_1D); + count++; + } // end for i + } // end for k + + // printf("j-plus\n"); + + /* + + i,j,k layout + + k j + | / + |/ + o-->i + + + k=0,kmax + + (i,j+1) o--o (i+1,j+1) + / / + (i,j) o--o (i+1,j) + + */ + + k_patch = 0; + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j, k_patch, num_1D); + count++; + + // node_lid 1 in 
patch + temp_node_lids[count] = i + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j+1, k_patch, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j+1, k_patch, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + 1 + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j, k_patch, num_1D); + count++; + } // end for i + } // end for j + // printf("k-minus\n"); + + k_patch = num_1D - 1; + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j, k_patch, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + 1 + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j, k_patch, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j+1, k_patch, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j+1, k_patch, num_1D); + count++; + } // end for i + } // end for j + + // printf("k-plus\n"); + + count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < 6; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < 4; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end if 3D + // + else{ + // 2D arbitrary order elements + int count = 0; + int i_patch, j_patch; + + // i-minus-dir patches + + i_patch = 0; + for (int j = 0; j < num_1D - 1; j++) { + temp_node_lids[count] = i_patch + j * num_1D; // node_rid(i_patch, j, num_1D; + count++; + + temp_node_lids[count] = 
i_patch + (j + 1) * num_1D; // node_rid(i_patch, j+1, num_1D; + count++; + } // end for j + + // i-plus-dir patches + i_patch = num_1D - 1; + for (int j = 0; j < num_1D - 1; j++) { + temp_node_lids[count] = i_patch + j * num_1D; // node_rid(i_patch, j, num_1D; + count++; + + temp_node_lids[count] = i_patch + (j + 1) * num_1D; // node_rid(i_patch, j+1, num_1D; + count++; + } // end for j + + j_patch = 0; + for (int i = 0; i < num_1D - 1; i++) { + temp_node_lids[count] = i + j_patch * num_1D; // node_rid(i, j_patch, num_1D); + count++; + + temp_node_lids[count] = i + 1 + j_patch * num_1D; // node_rid(i+1, j_patch, num_1D); + count++; + } // end for i + + j_patch = num_1D - 1; + for (int i = 0; i < num_1D - 1; i++) { + temp_node_lids[count] = i + j_patch * num_1D; // node_rid(i, j_patch, num_1D); + count++; + + temp_node_lids[count] = i + 1 + j_patch * num_1D; // node_rid(i+1, j_patch, num_1D); + count++; + } // end for i + + count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end else on dim + + // build zones in high order element + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + size_t node_lids[8]; // temp storage for local node ids + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + node_lids[0] = i + j * (num_1D) + k * (num_1D) * (num_1D); // i,j,k + node_lids[1] = i + 1 + j * (num_1D) + k * (num_1D) * (num_1D); // i+1, j, k + node_lids[2] = i + (j + 1) * (num_1D) + k * (num_1D) * (num_1D); // i,j+1,k + node_lids[3] = i + 1 + (j + 1) * (num_1D) + k * (num_1D) * (num_1D); // i+1, j+1, k + node_lids[4] 
= i + j * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i, j , k+1 + node_lids[5] = i + 1 + j * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i + 1, j , k+1 + node_lids[6] = i + (j + 1) * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i,j+1,k+1 + node_lids[7] = i + 1 + (j + 1) * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i+1, j+1, k+1 + + size_t zone_lid = i + j * (num_1D - 1) + k * (num_1D - 1) * (num_1D - 1); + size_t zone_gid = zones_in_elem(elem_gid, zone_lid); + + for (int node_lid = 0; node_lid < 8; node_lid++) { + // get global id for the node + size_t node_gid = nodes_in_elem(elem_gid, node_lids[node_lid]); + nodes_in_zone(zone_gid, node_lid) = node_gid; + } + } // i + } // j + } // k + }); // end FOR_ALL elem_gid + } // end if arbitrary-order element + else { + printf("\nERROR: mesh type is not known \n"); + } // end if + + // update the device + node_ordering_in_elem.update_device(); + Kokkos::fence(); + + printf("Built node ordering \n"); + + // for saving the hash keys of the patches and then the neighboring elem_gid + CArrayKokkos hash_keys_in_elem(num_elems, num_patches_in_elem, num_nodes_in_patch, "hash_keys_in_elem"); // always 4 ids in 3D + + // for saving the adjacent patch_lid, which is the slide_lid + // CArrayKokkos neighboring_side_lids (num_elems, num_patches_in_elem); + + // allocate memory for the patches in the elem + patches_in_elem = CArrayKokkos(num_elems, num_patches_in_elem, "mesh.patches_in_elem"); + + // a temporary storage for the patch_gids that are on the mesh boundary + CArrayKokkos temp_bdy_patches(num_elems * num_patches_in_elem, "temp_bdy_patches"); + + // step 1) calculate the hash values for each patch in the element + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + size_t sorted_patch_nodes[4]; // note: cannot be allocated with num_nodes_in_patch + + // first save the patch nodes + for (size_t patch_node_lid = 0; patch_node_lid < num_nodes_in_patch; 
patch_node_lid++) { + // get the local node index of the element for this patch and node in patch + size_t node_lid = node_ordering_in_elem(patch_lid, patch_node_lid); + + // get and save the global index of the node + sorted_patch_nodes[patch_node_lid] = nodes_in_elem(elem_gid, node_lid); + } // end for node_lid + + // sort nodes from smallest to largest + bubble_sort(sorted_patch_nodes, num_nodes_in_patch); + + // save hash_keys in the this elem + for (size_t key_lid = 0; key_lid < num_nodes_in_patch; key_lid++) { + hash_keys_in_elem(elem_gid, patch_lid, key_lid) = sorted_patch_nodes[key_lid]; // 4 node values are keys + } // for + } // end for patch_lid + }); // end FOR_ALL elem_gid + + DCArrayKokkos num_values(2, "num_values"); + + // 8x8x8 mesh + // num_patches = 8*8*9*3 = 1728 + // bdy_patches = 8*8*6 = 384 + // + + // step 2: walk around the elements and save the elem pairs that have the same hash_key + RUN_CLASS({ + // serial execution on GPU + + size_t patch_gid = 0; + size_t bdy_patch_gid = 0; + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + // loop over the patches in this elem + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + size_t exit = 0; + + // negative values mean the patch has not been saved + if (hash_keys_in_elem(elem_gid, patch_lid, 0) >= 0) { + // find the nighboring patch with the same hash_key + + for (size_t neighbor_elem_lid = 0; neighbor_elem_lid < num_elems_in_elem(elem_gid); neighbor_elem_lid++) { + // get the neighboring element global index + size_t neighbor_elem_gid = elems_in_elem(elem_gid, neighbor_elem_lid); + + for (size_t neighbor_patch_lid = 0; neighbor_patch_lid < num_patches_in_elem; neighbor_patch_lid++) { + size_t save_it = 0; + for (size_t key_lid = 0; key_lid < num_nodes_in_patch; key_lid++) { + if (hash_keys_in_elem(neighbor_elem_gid, neighbor_patch_lid, key_lid) == hash_keys_in_elem(elem_gid, patch_lid, key_lid)) { + save_it++; // if save_it == num_nodes after this loop, 
then it is a match + } + } // end key loop + + // this hash is from the nodes on the patch + if (save_it == num_nodes_in_patch) { + // make it negative, because we saved it + hash_keys_in_elem(elem_gid, patch_lid, 0) = -1; + hash_keys_in_elem(neighbor_elem_gid, neighbor_patch_lid, 0) = -1; + + // save the patch_lids for the adjacent sides + // neighboring_side_lids(elem_gid, patch_lid) = neighbor_patch_lid; + // neighboring_side_lids(neighbor_elem_gid, neighbor_patch_lid) = patch_lid; + + // save the patch_gid + patches_in_elem(elem_gid, patch_lid) = patch_gid; + patches_in_elem(neighbor_elem_gid, neighbor_patch_lid) = patch_gid; + + patch_gid++; + + exit = 1; + break; + } // end if + } // end for loop over a neighbors patch set + + if (exit == 1) { + break; + } + } // end for loop over elem neighbors + } // end if hash<0 + } // end for patch_lid + + // loop over the patches in this element again + // remaining positive hash key values are the boundary patches + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + if (hash_keys_in_elem(elem_gid, patch_lid, 0) >= 0) { + hash_keys_in_elem(elem_gid, patch_lid, 0) = -1; // make it negative, because we saved it + + // neighboring_side_lids(elem_gid, patch_lid) = patch_lid; + + patches_in_elem(elem_gid, patch_lid) = patch_gid; + temp_bdy_patches(bdy_patch_gid) = patch_gid; + + patch_gid++; + bdy_patch_gid++; + } // end if + } // end for over patch_lid + } // end for over elem_gid + + // the num_values is because the values passed in are const, so a const pointer is needed + num_values(0) = patch_gid; // num_patches = patch_gid; + num_values(1) = bdy_patch_gid; // num_bdy_patches = bdy_patch_gid; + }); // end RUN + Kokkos::fence(); + + num_values.update_host(); + Kokkos::fence(); + + num_patches = num_values.host(0); + // this lines assumes num_surfs == num_patches, only valid for 1st order elements + num_surfs = num_values.host(0); + num_bdy_patches = num_values.host(1); + + // size_t mesh_1D = 
60; + // size_t exact_num_patches = (mesh_1D*mesh_1D)*(mesh_1D+1)*3; + // size_t exact_num_bdy_patches = (mesh_1D*mesh_1D)*6; + // printf("num_patches = %lu, exact = %lu \n", num_patches, exact_num_patches); + // printf("num_bdy_patches = %lu exact = %lu \n", num_bdy_patches, exact_num_bdy_patches); + + // printf("Num patches = %lu \n", num_patches); + // printf("Num boundary patches = %lu \n", num_bdy_patches); + + elems_in_patch = CArrayKokkos(num_patches, 2, "mesh.elems_in_patch"); + nodes_in_patch = CArrayKokkos(num_patches, num_nodes_in_patch, "mesh.nodes_in_patch"); + + // a temporary variable to help populate patch structures + CArrayKokkos num_elems_in_patch_saved(num_patches, "num_elems_in_patch_saved"); + + // initialize the number of elems in a patch saved to zero + FOR_ALL_CLASS(patch_gid, 0, num_patches, { + num_elems_in_patch_saved(patch_gid) = 0; + }); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(patch_lid, 0, num_patches_in_elem, { + size_t patch_gid = patches_in_elem(elem_gid, patch_lid); + + size_t num_saved = num_elems_in_patch_saved(patch_gid); + + elems_in_patch(patch_gid, num_saved) = elem_gid; + + // record that an elem_gid was saved + num_elems_in_patch_saved(patch_gid)++; + + // save the nodes on this patch + for (size_t patch_node_lid = 0; patch_node_lid < num_nodes_in_patch; patch_node_lid++) { + // get the local node index of the element for this patch and node in patch + size_t node_lid = node_ordering_in_elem(patch_lid, patch_node_lid); + + // get and save the global index of the node + nodes_in_patch(patch_gid, patch_node_lid) = nodes_in_elem(elem_gid, node_lid); + } // end for node_lid + }); // end FOR_ALL patch_lid + } // end for + + // Surfaces and patches in surface + if (elem_kind == mesh_init::arbitrary_tensor_element) { + // allocate memory for the surfaces in the elem + surfs_in_elem = CArrayKokkos(num_elems, num_surfs_in_elem); + + // allocate memory for surface data structures + num_surfs 
= num_patches / num_patches_in_surf; + + patches_in_surf = CArrayKokkos(num_surfs, num_patches_in_surf, "mesh.patches_in_surf"); + elems_in_surf = CArrayKokkos(num_surfs, 2, "mesh.elems_in_surf"); + surf_in_patch = CArrayKokkos(num_patches, "mesh.surf_in_patch"); + + FOR_ALL_CLASS(surf_gid, 0, num_surfs, { + // loop over the patches in this surface + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + // get patch_gid + size_t patch_gid = patch_lid + surf_gid * num_patches_in_surf; + + // save the patch_gids + patches_in_surf(surf_gid, patch_lid) = patch_gid; + + // save the surface this patch belongs to + surf_in_patch(patch_gid) = surf_gid; + } // end for + + // get first patch in the surface, and populate elem surface structures + size_t this_patch_gid = surf_gid * num_patches_in_surf; + + elems_in_surf(surf_gid, 0) = elems_in_patch(this_patch_gid, 0); // elem_gid0 + elems_in_surf(surf_gid, 1) = elems_in_patch(this_patch_gid, 1); // elem_gid1 + }); // end FOR_ALL over surfaces + + // save surfaces in elem + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + // get the local patch_lid + size_t patch_lid = surf_lid * num_patches_in_surf; + + // get the patch_gids in this element + size_t patch_gid = patches_in_elem(elem_gid, patch_lid); + + // save the surface gid + // Grab the first patch on surf and return surface_gid from surf_in_patch // + surfs_in_elem(elem_gid, surf_lid) = surf_in_patch(patch_gid); + } // end surf_lid + }); + + DViewCArrayKokkos surf_node_ordering_in_elem; + + if (num_dims == 3) { + // num_1D = Pn+1 + int num_surface_nodes = num_surfs_in_elem * pow(num_1D, num_dims - 1); + size_t temp_surf_node_lids[num_surface_nodes]; + // 2D arbitrary order elements + int count = 0; + + for (int i_surf = 0; i_surf < 2; i_surf++) { + for (int k = 0; k < num_1D; k++) { + for (int j = 0; j < num_1D; j++) { + // node_lid 0 in patch + // index = i + j*num_1D + 
k*num_1D*num_1D; + temp_surf_node_lids[count] = i_surf + j * num_1D + k * num_1D * num_1D; + count++; + } // end for k + } // end for j + } + + for (int j_surf = 0; j_surf < 2; j_surf++) { + for (int k = 0; k < num_1D; k++) { + for (int i = 0; i < num_1D; i++) { + // node_lid 0 in patch + temp_surf_node_lids[count] = i + j_surf * num_1D + k * num_1D * num_1D; + count++; + } + } + } + + for (int k_surf = 0; k_surf < 2; k_surf++) { + for (int j = 0; j < num_1D; j++) { + for (int i = 0; i < num_1D; i++) { + // node_lid 0 in patch + temp_surf_node_lids[count] = i + j * num_1D + k_surf * num_1D * num_1D; + count++; + } + } + } + + nodes_in_surf = CArrayKokkos(num_surfs, num_1D * num_1D, "mesh.nodes_in_surf"); + + num_nodes_in_surf = num_1D * num_1D; + surf_node_ordering_in_elem = DViewCArrayKokkos(&temp_surf_node_lids[0], num_surfs_in_elem, num_nodes_in_surf); + surf_node_ordering_in_elem.update_device(); + for (int elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(surf_lid, 0, num_surfs_in_elem, { + int surf_gid = surfs_in_elem(elem_gid, surf_lid); + for (int surf_node_lid = 0; surf_node_lid < num_nodes_in_surf; surf_node_lid++) { + int node_lid = surf_node_ordering_in_elem(surf_lid, surf_node_lid); + int node_gid = nodes_in_elem(elem_gid, node_lid); + nodes_in_surf(surf_gid, surf_node_lid) = node_gid; + } // end loop over surf_node_lid + }); // end loop over FOR_ALL_CLASS + } // end loop over elem_gid + } // end 3D scope + } // end of high-order mesh objects + + // ---------------- + + // allocate memory for boundary patches + bdy_patches = CArrayKokkos(num_bdy_patches, "mesh.bdy_patches"); + + FOR_ALL_CLASS(bdy_patch_gid, 0, num_bdy_patches, { + bdy_patches(bdy_patch_gid) = temp_bdy_patches(bdy_patch_gid); + }); // end FOR_ALL bdy_patch_gid + + // find and store the boundary nodes + CArrayKokkos temp_bdy_nodes(num_nodes, "temp_bdy_nodes"); + CArrayKokkos hash_bdy_nodes(num_nodes, "hash_bdy_nodes"); + + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + 
hash_bdy_nodes(node_gid) = -1; + }); // end for node_gid + + // Parallel loop over boundary patches + DCArrayKokkos num_bdy_nodes_saved(1, "num_bdy_nodes_saved"); + + RUN_CLASS({ + num_bdy_nodes_saved(0) = 0; + for (size_t bdy_patch_gid = 0; bdy_patch_gid < num_bdy_patches; bdy_patch_gid++) { + // get the global index of the patch that is on the boundary + size_t patch_gid = bdy_patches(bdy_patch_gid); + + // tag the boundary nodes + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + size_t node_gid = nodes_in_patch(patch_gid, node_lid); + + if (hash_bdy_nodes(node_gid) < 0) { + hash_bdy_nodes(node_gid) = node_gid; + temp_bdy_nodes(num_bdy_nodes_saved(0)) = node_gid; + + // printf("bdy_node = %lu \n", node_gid); + num_bdy_nodes_saved(0)++; + } // end if + } // end for node_lid + } // end for loop over bdy_patch_gid + }); // end RUN + Kokkos::fence(); + + // copy value to host (CPU) + num_bdy_nodes_saved.update_host(); + Kokkos::fence(); + + // save the number of bdy_nodes to Mesh_t + num_bdy_nodes = num_bdy_nodes_saved.host(0); + + bdy_nodes = CArrayKokkos(num_bdy_nodes, "mesh.bdy_nodes"); + + FOR_ALL_CLASS(node_gid, 0, num_bdy_nodes, { + bdy_nodes(node_gid) = temp_bdy_nodes(node_gid); + }); // end for boundary node_gid + + // printf("Num boundary nodes = %lu \n", num_bdy_nodes); + + return; + } // end patch connectivity method + + // build the patches + void build_node_node_connectivity() + { + // find the max number of elems around a node + size_t max_num_elems_in_node; + size_t max_num_lcl; + FOR_REDUCE_MAX_CLASS(node_gid, 0, num_nodes, max_num_lcl, { + // num_corners_in_node = num_elems_in_node + size_t max_num = num_corners_in_node(node_gid); + + if (max_num > max_num_lcl) { + max_num_lcl = max_num; + } + }, max_num_elems_in_node); // end parallel reduction on max + Kokkos::fence(); + + // each elem corner will contribute 3 edges to the node. 
Those edges will likely be the same + // ones from an adjacent element so it is a safe estimate to multiply by 3 + DynamicRaggedRightArrayKokkos temp_nodes_in_nodes(num_nodes, max_num_elems_in_node * 3, "temp_nodes_in_nodes"); + + num_nodes_in_node = CArrayKokkos(num_nodes, "mesh.num_nodes_in_node"); + + // walk over the patches and save the node node connectivity + RUN_CLASS({ + if (num_dims == 3) { + for (size_t patch_gid = 0; patch_gid < num_patches; patch_gid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + // the first node on the edge + size_t node_gid_0 = nodes_in_patch(patch_gid, node_lid); + + // second node on this edge + size_t node_gid_1; + + if (node_lid == num_nodes_in_patch - 1) { + node_gid_1 = nodes_in_patch(patch_gid, 0); + } + else { + node_gid_1 = nodes_in_patch(patch_gid, node_lid + 1); + } // end if + + size_t num_saved_0 = temp_nodes_in_nodes.stride(node_gid_0); + size_t num_saved_1 = temp_nodes_in_nodes.stride(node_gid_1); + + size_t save_0 = 1; + size_t save_1 = 1; + + // check to see if the node_gid_1 was already saved + for (size_t contents_lid = 0; contents_lid < num_saved_0; contents_lid++) { + if (temp_nodes_in_nodes(node_gid_0, contents_lid) == node_gid_1) { + save_0 = 0; // don't save, it was already saved + } + } + + // check to see if the node_gid_0 was already saved + for (size_t contents_lid = 0; contents_lid < num_saved_1; contents_lid++) { + if (temp_nodes_in_nodes(node_gid_1, contents_lid) == node_gid_0) { + save_1 = 0; // don't save, it was already saved + } + } + + if (save_0 == 1) { + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_0)++; + + // save the second node to the first node + temp_nodes_in_nodes(node_gid_0, num_saved_0) = node_gid_1; + } + + if (save_1 == 1) { + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_1)++; + + // save the first node to the second node + temp_nodes_in_nodes(node_gid_1, num_saved_1) = 
node_gid_0; + } + + // save the strides + num_nodes_in_node(node_gid_0) = temp_nodes_in_nodes.stride(node_gid_0); + num_nodes_in_node(node_gid_1) = temp_nodes_in_nodes.stride(node_gid_1); + } // end for node in patch + } // end for patches + } // end if 3D + else { + for (size_t patch_gid = 0; patch_gid < num_patches; patch_gid++) { + // the first node on the edge + size_t node_gid_0 = nodes_in_patch(patch_gid, 0); + + // second node on this edge + size_t node_gid_1 = nodes_in_patch(patch_gid, 1); + + size_t num_saved_0 = temp_nodes_in_nodes.stride(node_gid_0); + size_t num_saved_1 = temp_nodes_in_nodes.stride(node_gid_1); + + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_0)++; + temp_nodes_in_nodes.stride(node_gid_1)++; + + // save the second node to the first node + temp_nodes_in_nodes(node_gid_0, num_saved_0) = node_gid_1; + + // save the first node to the second node + temp_nodes_in_nodes(node_gid_1, num_saved_1) = node_gid_0; + + // save the strides + num_nodes_in_node(node_gid_0) = temp_nodes_in_nodes.stride(node_gid_0); + num_nodes_in_node(node_gid_1) = temp_nodes_in_nodes.stride(node_gid_1); + } // end for patches + } // end if 2D + }); // end RUN + Kokkos::fence(); + + nodes_in_node = RaggedRightArrayKokkos(num_nodes_in_node, "mesh.nodes_in_node"); + + // save the connectivity + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + size_t num_saved = 0; + for (size_t node_lid = 0; node_lid < num_nodes_in_node(node_gid); node_lid++) { + nodes_in_node(node_gid, num_saved) = temp_nodes_in_nodes(node_gid, num_saved); + + // increment the number of nodes in node saved + num_saved++; + } // end for node_lid + }); // end parallel for over nodes + } // end of node node connectivity + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_connectivity + /// + /// \brief Calls multiple build connectivity function + /// + 
///////////////////////////////////////////////////////////////////////////// + void build_connectivity() + { + build_corner_connectivity(); + printf("Built corner connectivity \n"); + + build_elem_elem_connectivity(); + printf("Built element-element connectivity \n"); + + build_patch_connectivity(); + printf("Built patch connectivity \n"); + + build_node_node_connectivity(); + printf("Built node-node connectivity \n"); + } + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn init_bdy_sets + /// + /// \brief Initialize memory for boundary sets + /// + ///////////////////////////////////////////////////////////////////////////// + void init_bdy_sets(size_t num_bcs) + { + // if (num_bcs == 0) { + // printf("ERROR: number of boundary sets = 0, set it = 1"); + // num_bcs = 1; + // } + num_bdy_sets = num_bcs; + num_bdy_patches_in_set = DCArrayKokkos(num_bcs, "mesh.num_bdy_patches_in_set"); + + // bdy_patches_in_set is a raggedRight array, it is allocated + // in tag_bdys fcn after the sparsity is known, see geometry_new.cpp + + return; + } // end of init_bdy_sets method + + +}; // end Mesh_t + +#endif \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp new file mode 100644 index 00000000..dd26b631 --- /dev/null +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -0,0 +1,32 @@ +#include +#include +#include +#include +#include +#include + +// Include Scotch headers +#include "scotch.h" +#include "ptscotch.h" + + +struct initial_mesh_t { + int num_elems; // Number of elements + + std::vector nodes_in_elem; // Nodes in an element + std::vector elems_in_elem; // Elements in an element + + std::vector verttab; // Start index in edgetab for each element (size num_elems+1) + std::vector edgetab; // Adjacency info: neighboring element indices +}; + + +int main(int argc, char** argv) { + + initial_mesh_t initial_mesh; + + + + + return 0; +} \ No newline at end of file diff 
--git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h new file mode 100644 index 00000000..03fee676 --- /dev/null +++ b/examples/mesh_decomp/mesh_io.h @@ -0,0 +1,4894 @@ +/********************************************************************************************** +© 2020. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos +National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All rights in the program are +reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear +Security Administration. The Government is granted for itself and others acting on its behalf a +nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare +derivative works, distribute copies to the public, perform publicly and display publicly, and +to permit others to do so. +This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used +to endorse or promote products derived from this software without specific prior +written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************************/ +#ifndef FIERRO_IO_H +#define FIERRO_IO_H + +#include "matar.h" +#include "mesh.h" +#include "state.h" +#include "simulation_parameters.h" +#include "region.h" +#include "string_utils.h" + +#include +#include +#include +#include +#include +#include // for string pattern recoginition +#include +#include +#include +#include + + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn get_id +/// +/// \brief This gives the index value of the point or the elem +/// +/// Assumes that the grid has an i,j,k structure +/// the elem = i + (j)*(num_points_i-1) + (k)*(num_points_i-1)*(num_points_j-1) +/// the point = i + (j)*num_points_i + (k)*num_points_i*num_points_j +/// +/// \param i index +/// \param j index +/// \param k index +/// \param Number of i indices +/// \param Number of j indices +/// +///////////////////////////////////////////////////////////////////////////// +inline int get_id(int i, int j, int k, int num_i, int num_j) +{ + return i + j * num_i + k * num_i * num_j; +} + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn 
PointIndexFromIJK +/// +/// \brief Given (i,j,k) coordinates within the Lagrange hex, return an +/// offset into the local connectivity (PointIds) array. The order parameter +/// must point to an array of 3 integers specifying the order along each +/// axis of the hexahedron. +/// +///////////////////////////////////////////////////////////////////////////// +inline int PointIndexFromIJK(int i, int j, int k, const int* order) +{ + bool ibdy = (i == 0 || i == order[0]); + bool jbdy = (j == 0 || j == order[1]); + bool kbdy = (k == 0 || k == order[2]); + // How many boundaries do we lie on at once? + int nbdy = (ibdy ? 1 : 0) + (jbdy ? 1 : 0) + (kbdy ? 1 : 0); + + if (nbdy == 3) { // Vertex DOF + // ijk is a corner node. Return the proper index (somewhere in [0,7]): + return (i ? (j ? 2 : 1) : (j ? 3 : 0)) + (k ? 4 : 0); + } + + int offset = 8; + if (nbdy == 2) { // Edge DOF + if (!ibdy) { // On i axis + return (i - 1) + (j ? order[0] - 1 + order[1] - 1 : 0) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + if (!jbdy) { // On j axis + return (j - 1) + (i ? order[0] - 1 : 2 * (order[0] - 1) + order[1] - 1) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + // !kbdy, On k axis + offset += 4 * (order[0] - 1) + 4 * (order[1] - 1); + return (k - 1) + (order[2] - 1) * (i ? (j ? 3 : 1) : (j ? 2 : 0)) + offset; + } + + offset += 4 * (order[0] - 1 + order[1] - 1 + order[2] - 1); + if (nbdy == 1) { // Face DOF + if (ibdy) { // On i-normal face + return (j - 1) + ((order[1] - 1) * (k - 1)) + (i ? (order[1] - 1) * (order[2] - 1) : 0) + offset; + } + offset += 2 * (order[1] - 1) * (order[2] - 1); + if (jbdy) { // On j-normal face + return (i - 1) + ((order[0] - 1) * (k - 1)) + (j ? (order[2] - 1) * (order[0] - 1) : 0) + offset; + } + offset += 2 * (order[2] - 1) * (order[0] - 1); + // kbdy, On k-normal face + return (i - 1) + ((order[0] - 1) * (j - 1)) + (k ? 
(order[0] - 1) * (order[1] - 1) : 0) + offset; + } + + // nbdy == 0: Body DOF + offset += 2 * ( (order[1] - 1) * (order[2] - 1) + (order[2] - 1) * (order[0] - 1) + (order[0] - 1) * (order[1] - 1)); + return offset + (i - 1) + (order[0] - 1) * ( (j - 1) + (order[1] - 1) * ( (k - 1))); +} + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn get_id_device +/// +/// \brief This gives the index value of the point or the elem +/// +/// Assumes that the grid has an i,j,k structure +/// the elem = i + (j)*(num_points_i-1) + (k)*(num_points_i-1)*(num_points_j-1) +/// the point = i + (j)*num_points_i + (k)*num_points_i*num_points_j +/// +/// \param i index +/// \param j index +/// \param k index +/// \param Number of i indices +/// \param Number of j indices +/// +///////////////////////////////////////////////////////////////////////////// +KOKKOS_INLINE_FUNCTION +int get_id_device(int i, int j, int k, int num_i, int num_j) +{ + return i + j * num_i + k * num_i * num_j; +} + + +//------- +// word is the field name e.g., Offsets, connectivity, etc. 
+// stop is the phrase to stop extracting values +template +inline bool extract_values_xml(T *values_xml, + const std::string& word, + const std::string& stop, + std::ifstream& in, + size_t& size) +{ + + bool found = false; + + std::string line; + + size_t i = 0; + + // Read the file line by line looking for specified word + while (std::getline(in, line)) { + + if (line.find(word) != std::string::npos) { // Check if the portion of the word is in the line + found = true; + } + if(found) { + + // loop over the lines in the file, extracting the values of the field corresponding to the word + while (std::getline(in, line)){ + + std::istringstream iss(line); // Create a stream from the line + + // extract the individual values from the stream + T value; + while (iss >> value) { + values_xml[i] = value; + i++; + } // end while + + if (line.find(stop) != std::string::npos) { // Check if the stop word is in the line + break; + } // end if + + } // end while + + if(found) break; + + } // end if found + + } // end while + + size = i; + + return found; + +} // end function + + +// find the number of points and number of cells in the mesh +inline bool extract_num_points_and_cells_xml(int& numberOfPoints, + int& numberOfCells, + std::ifstream& in) +{ + bool found = false; + + std::string line; + + + // Read the file line by line looking for NumberOfPoints + while (std::getline(in, line)) { + + std::string word = "NumberOfPoints="; // A portion of a word + + if (line.find(word) != std::string::npos) { // Check if the portion of the word is in the line + found = true; + } + if(found) { + // Define regex pattern to match the attributes and capture values + std::regex pattern(R"(NumberOfPoints=\"(\d+)\" NumberOfCells=\"(\d+)\")"); + std::smatch match; + + if (std::regex_search(line, match, pattern)) { + //std::cout << "Number of nodes in mesh file: " << match[1] << std::endl; + //std::cout << "Number of cells in mesh file: " << match[2] << std::endl; + + numberOfPoints = 
std::stoi(match[1].str()); + numberOfCells = std::stoi(match[2].str()); + + } else { + std::cout << "Error reading the number of points and cells in the mesh!" << std::endl; + } + + break; + } // end if + + } // end while + + return found; + +} // end function + + +// 8 = pixal i,j,k linear quad ording +// 9 = linear quad ensight ordering +// 11 = voxel i,j,k linear hex ording +// 12 = linear ensight hex ordering +// 72 = VTK_LAGRANGE_HEXAHEDRON +namespace element_types +{ + enum element_name + { + linear_quad_ijk = 8, + linear_quad = 9, + linear_hex_ijk = 11, + linear_hex = 12, + arbitrary_hex = 72 + }; +} + +///////////////////////////////////////////////////////////////////////////// +/// +/// \class MeshReader +/// +/// \brief Class for simplifying reading meshes +/// +/// This class contains the requisite functions required to read different +/// mesh formats. The idea is to set the mesh file name, and parse the +/// extension to decide which reader to use. Currently, only ensight .geo +/// files are supported. 
+/// +///////////////////////////////////////////////////////////////////////////// +class MeshReader +{ +private: + // Handy structs for parsing input meshes + struct Node { + int id; + double x, y, z; + }; + + struct Element { + int id; + std::vector connectivity; + }; + +public: + + char* mesh_file_ = NULL; + + MeshReader() {} // Simulation_Parameters& _simparam); + + ~MeshReader() = default; + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn set_mesh_file + /// + /// \brief Sets the mesh file path for reading in a mesh + /// + /// \param Path to mesh file + /// + ///////////////////////////////////////////////////////////////////////////// + void set_mesh_file(char* MESH) + { + mesh_file_ = MESH; + } + + // Reads and initializes the mesh and geometric state entities + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_mesh + /// + /// \brief Read mesh from file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Number of dimensions + /// + /// + ///////////////////////////////////////////////////////////////////////////// + void read_mesh(Mesh_t& mesh, + State_t& State, + mesh_input_t& mesh_inps, + int num_dims) + { + if (mesh_file_ == NULL) { + throw std::runtime_error("**** No mesh path given for read_mesh ****"); + } + + std::ifstream file(mesh_file_); + if (file.is_open()) { + std::cout << "The file exists." 
<< std::endl; + file.close(); + } else { + throw std::runtime_error("**** Mesh path given does not exists ****"); + } + + // Check mesh file extension + // and read based on extension + std::string filePathStr(mesh_file_); + std::string extension; + + size_t pos = filePathStr.rfind('.'); + if (pos != std::string::npos) { + extension = filePathStr.substr(pos + 1); + } else { + extension = ""; + } + + std::cout << "File extension is: " << extension << std::endl; + + if(extension == "geo"){ // Ensight meshfile extension + read_ensight_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); + } + else if(extension == "inp"){ // Abaqus meshfile extension + read_Abaqus_mesh(mesh, State, num_dims); + } + else if(extension == "vtk"){ // vtk file format + read_vtk_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); + } + else if(extension == "vtu"){ // vtu file format + read_vtu_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); + } + else{ + throw std::runtime_error("**** Mesh file extension not understood ****"); + } + + } + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_ensight_mesh + /// + /// \brief Read .geo mesh file + /// + /// \param Simulation mesh + /// \param Element state struct + /// \param Node state struct + /// \param Corner state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_ensight_mesh(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + mesh_input_t& mesh_inps, + int num_dims) + { + FILE* in; + char ch; + + size_t num_nodes_in_elem = 1; + for (int dim = 0; dim < num_dims; dim++) { + num_nodes_in_elem *= 2; + } + + // read the mesh WARNING: assumes a .geo file + in = fopen(mesh_file_, "r"); + + // skip 8 lines + for (int j = 1; j <= 8; j++) { + int i = 0; + while ((ch = (char)fgetc(in)) != '\n') { + i++; + } 
+ } + + // --- Read in the nodes in the mesh --- + + size_t num_nodes = 0; + + fscanf(in, "%lu", &num_nodes); + printf("Number of nodes read in %lu\n", num_nodes); + + + mesh.initialize_nodes(num_nodes); + + // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dims, required_node_state); + + // read the initial mesh coordinates + // x-coords + for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { + fscanf(in, "%le", &node.coords.host(node_id, 0)); + node.coords.host(node_id, 0)*= mesh_inps.scale_x; + } + + // y-coords + for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { + fscanf(in, "%le", &node.coords.host(node_id, 1)); + node.coords.host(node_id, 1)*= mesh_inps.scale_y; + } + + // z-coords + for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { + if (num_dims == 3) { + fscanf(in, "%le", &node.coords.host(node_id, 2)); + node.coords.host(node_id, 2)*= mesh_inps.scale_z; + } + else{ + double dummy; + fscanf(in, "%le", &dummy); + } + } // end for + + + // Update device nodal positions + node.coords.update_device(); + + ch = (char)fgetc(in); + + // skip 1 line + for (int j = 1; j <= 1; j++) { + int i = 0; + while ((ch = (char)fgetc(in)) != '\n') { + i++; + } + } + + // --- read in the elements in the mesh --- + size_t num_elem = 0; + + fscanf(in, "%lu", &num_elem); + printf("Number of elements read in %lu\n", num_elem); + + // initialize elem variables + mesh.initialize_elems(num_elem, num_dims); + // GaussPoints.initialize(num_elem, 3); // always 3D here, even for 2D + + + // for each cell read the list of associated nodes + for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { + for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { + fscanf(in, "%lu", &mesh.nodes_in_elem.host(elem_gid, node_lid)); // %d vs zu + + // shift to start node index space at 0 + 
mesh.nodes_in_elem.host(elem_gid, node_lid) -= 1; + } + } + + // Convert from ensight to IJK mesh + int convert_ensight_to_ijk[8]; + convert_ensight_to_ijk[0] = 0; + convert_ensight_to_ijk[1] = 1; + convert_ensight_to_ijk[2] = 3; + convert_ensight_to_ijk[3] = 2; + convert_ensight_to_ijk[4] = 4; + convert_ensight_to_ijk[5] = 5; + convert_ensight_to_ijk[6] = 7; + convert_ensight_to_ijk[7] = 6; + + int tmp_ijk_indx[8]; + + for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { + for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { + tmp_ijk_indx[node_lid] = mesh.nodes_in_elem.host(elem_gid, convert_ensight_to_ijk[node_lid]); + } + + for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++){ + mesh.nodes_in_elem.host(elem_gid, node_lid) = tmp_ijk_indx[node_lid]; + } + } + // update device side + mesh.nodes_in_elem.update_device(); + + // initialize corner variables + int num_corners = num_elem * mesh.num_nodes_in_elem; + mesh.initialize_corners(num_corners); + // corner.initialize(num_corners, num_dims); + + // Close mesh input file + fclose(in); + + // Build connectivity + mesh.build_connectivity(); + + return; + } // end read ensight mesh + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_Abaqus_mesh + /// + /// \brief Read .inp mesh file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Node state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_Abaqus_mesh(Mesh_t& mesh, + State_t& State, + int num_dims) + { + + std::cout<<"Reading abaqus input file for mesh"< nodes; + std::vector elements; + + std::string line; + bool readingNodes = false; + bool readingElements = false; + + while (std::getline(inputFile, line)) { + if (line.find("*Node") != std::string::npos) { + readingNodes = true; + std::cout<<"Found *Node"<> node.id && std::getline(iss, token, ',') && iss >> node.x && + 
std::getline(iss, token, ',') && iss >> node.y && + std::getline(iss, token, ',') && iss >> node.z)) { + std::cerr << "Failed to parse line: " << line << std::endl; + continue; // Skip this line if parsing failed + } + nodes.push_back(node); + } + + if (line.find("*Element") != std::string::npos) { + readingElements = true; + std::cout<<"Found *Element*"<> element.id)){ + std::cout << "Failed to parse line: " << line << std::endl; + continue; // Skip this line if parsing failed + } + + while ((std::getline(iss, token, ','))) { + // Now extract the integer, ignoring any trailing whitespace + int val; + iss >> val; + element.connectivity.push_back(val); + } + + // Convert from abaqus to IJK mesh + int convert_abq_to_ijk[8]; + convert_abq_to_ijk[0] = 0; + convert_abq_to_ijk[1] = 1; + convert_abq_to_ijk[2] = 3; + convert_abq_to_ijk[3] = 2; + convert_abq_to_ijk[4] = 4; + convert_abq_to_ijk[5] = 5; + convert_abq_to_ijk[6] = 7; + convert_abq_to_ijk[7] = 6; + + int tmp_ijk_indx[8]; + + for (int node_lid = 0; node_lid < 8; node_lid++) { + tmp_ijk_indx[node_lid] = element.connectivity[convert_abq_to_ijk[node_lid]]; + } + + for (int node_lid = 0; node_lid < 8; node_lid++){ + element.connectivity[node_lid] = tmp_ijk_indx[node_lid]; + } + + elements.push_back(element); + } + } + + inputFile.close(); + + size_t num_nodes = nodes.size(); + + printf("Number of nodes read in %lu\n", num_nodes); + + // initialize node variables + mesh.initialize_nodes(num_nodes); + + // initialize node state, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + + State.node.initialize(num_nodes, num_dims, required_node_state); + + + // Copy nodes to mesh + for(int node_gid = 0; node_gid < num_nodes; node_gid++){ + State.node.coords.host(node_gid, 0) = nodes[node_gid].x; + State.node.coords.host(node_gid, 1) = nodes[node_gid].y; + State.node.coords.host(node_gid, 2) = nodes[node_gid].z; + } + + // Update 
device nodal positions + State.node.coords.update_device(); + + + // --- read in the elements in the mesh --- + size_t num_elem = elements.size(); + printf("Number of elements read in %lu\n", num_elem); + + // initialize elem variables + mesh.initialize_elems(num_elem, num_dims); + + + // for each cell read the list of associated nodes + for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { + for (int node_lid = 0; node_lid < 8; node_lid++) { + mesh.nodes_in_elem.host(elem_gid, node_lid) = elements[elem_gid].connectivity[node_lid]; + + // shift to start node index space at 0 + mesh.nodes_in_elem.host(elem_gid, node_lid) -= 1; + } + } + + // update device side + mesh.nodes_in_elem.update_device(); + + // initialize corner variables + int num_corners = num_elem * mesh.num_nodes_in_elem; + mesh.initialize_corners(num_corners); + // State.corner.initialize(num_corners, num_dims); + + // Build connectivity + mesh.build_connectivity(); + } // end read abaqus mesh + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_vtk_mesh + /// + /// \brief Read ASCII .vtk mesh file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Node state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_vtk_mesh(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + mesh_input_t& mesh_inps, + int num_dims) + { + + std::cout<<"Reading VTK mesh"< v = split (str, delimiter); + + // looking for the following text: + // POINTS %d float + if(v[0] == "POINTS"){ + size_t num_nodes = std::stoi(v[1]); + printf("Number of nodes read in %zu\n", num_nodes); + mesh.initialize_nodes(num_nodes); + + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dims, required_node_state); + + found=true; + } // end if + + + if (i>1000){ + std::cerr << "ERROR: Failed to find POINTS in file" 
<< std::endl; + break; + } // end if + + i++; + } // end while + + // read the node coordinates + for (node_gid=0; node_gid v = split (str, delimiter); + + // save the nodal coordinates + node.coords.host(node_gid, 0) = mesh_inps.scale_x*std::stod(v[0]); // double + node.coords.host(node_gid, 1) = mesh_inps.scale_y*std::stod(v[1]); // double + if(num_dims==3){ + node.coords.host(node_gid, 2) = mesh_inps.scale_z*std::stod(v[2]); // double + } + + } // end for nodes + + + // Update device nodal positions + node.coords.update_device(); + + + found=false; + + // look for CELLS + i = 0; + size_t num_elem = 0; + while (found==false) { + std::string str; + std::getline(in, str); + + std::string delimiter = " "; + std::vector v = split (str, delimiter); + std::cout << v[0] << std::endl; // printing + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELLS"){ + num_elem = std::stoi(v[1]); + printf("Number of elements read in %zu\n", num_elem); + + // initialize elem variables + mesh.initialize_elems(num_elem, num_dims); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find CELLS \n"); + break; + } // end if + + i++; + } // end while + + + // read the node ids in the element + for (elem_gid=0; elem_gid v = split (str, delimiter); + num_nodes_in_elem = std::stoi(v[0]); + + for (size_t node_lid=0; node_lid v = split (str, delimiter); + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELL_TYPES"){ + + std::getline(in, str); + elem_type = std::stoi(str); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find elem_TYPE \n"); + break; + } // end if + + i++; + } // end while + printf("Element type = %zu \n", elem_type); + // elem types: + // linear hex = 12, linear quad = 9 + found=false; + + + if(num_nodes_in_elem==8 & elem_type != 12) { + printf("Wrong element type of %zu \n", elem_type); + std::cerr << "ERROR: incorrect element type in VTK file" << std::endl; + } + + 
in.close(); + + } // end of VTKread function + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_vtu_mesh + /// + /// \brief Read ASCII .vtu mesh file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Node state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_vtu_mesh(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + mesh_input_t& mesh_inps, + int num_dims) + { + + std::cout<<"Reading VTU file in a multiblock VTK mesh"< required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dims, required_node_state); + + //------------------------------------ + // allocate the elem object id array + mesh_inps.object_ids = DCArrayKokkos (num_elems, "ObjectIDs"); + + + // ------------------------ + // Mesh file storage order: + // objectId + // Points + // connectivity + // offsets + // types + // ------------------------ + + // temporary arrays + DCArrayKokkos node_coords(num_nodes,3, "node_coords_vtu_file"); // always 3 with vtu files + DCArrayKokkos connectivity(num_elems,num_nodes_in_elem, "connectivity_vtu_file"); + DCArrayKokkos elem_types(num_elems, "elem_types_vtu_file"); // element types + + + // for all fields, we stop recording when we get to "<" + std::string stop = "<"; + + // the size of 1D storage from reading the mesh file + size_t size; + + // --- + // Object ids + // --- + + // the object id in the element + // array dims are (num_elems) + found = extract_values_xml(mesh_inps.object_ids.host.pointer(), + "\"ObjectId\"", + stop, + in, + size); + if(found==false){ + throw std::runtime_error("ERROR: ObjectIDs were not found in the XML file!"); + //std::cout << "ERROR: ObjectIDs were not found in the XML file!" 
<< std::endl; + } + mesh_inps.object_ids.update_device(); + + + // --- + // Nodal coordinates of mesh + // --- + + // coordinates of the node + // array dims are (num_nodes,dims) + // must use the quotes around Points to read the point values + found = extract_values_xml(node_coords.host.pointer(), + "\"Points\"", + stop, + in, + size); + if(found==false){ + throw std::runtime_error("**** ERROR: mesh nodes were not found in the XML file! ****"); + //std::cout << "ERROR: mesh nodes were not found in the XML file!" << std::endl; + } + if (size!=num_nodes*3){ + throw std::runtime_error("ERROR: failed to read all the mesh nodes!"); + //std::cout << "ERROR: failed to read all the mesh nodes!" << std::endl; + } + node_coords.update_device(); + + // dimensional scaling of the mesh + const double scl_x = mesh_inps.scale_x; + const double scl_y = mesh_inps.scale_y; + const double scl_z = mesh_inps.scale_z; + + // save the node coordinates to the state array + FOR_ALL(node_gid, 0, mesh.num_nodes, { + + // save the nodal coordinates + node.coords(node_gid, 0) = scl_x*node_coords(node_gid, 0); // double + node.coords(node_gid, 1) = scl_y*node_coords(node_gid, 1); // double + if(num_dims==3){ + node.coords(node_gid, 2) = scl_z*node_coords(node_gid, 2); // double + } + + }); // end for parallel nodes + node.coords.update_host(); + + + // --- + // Nodes in the element + // --- + + // fill temporary nodes in the element array + // array dims are (num_elems,num_nodes_in_elem) + found = extract_values_xml(connectivity.host.pointer(), + "\"connectivity\"", + stop, + in, + size); + if(found==false){ + std::cout << "ERROR: mesh connectivity was not found in the XML file!" << std::endl; + } + connectivity.update_device(); + + // array dims are the (num_elems) + // 8 = pixal i,j,k linear quad format + // 9 = linear quad ensight ordering + // 12 = linear ensight hex ordering + // 72 = VTK_LAGRANGE_HEXAHEDRON + // .... 
+ found = extract_values_xml(elem_types.host.pointer(), + "\"types\"", + stop, + in, + size); + if(found==false){ + std::cout << "ERROR: element types were not found in the XML file!" << std::endl; + } + elem_types.update_device(); + + // check that the element type is supported by Fierro + FOR_ALL (elem_gid, 0, mesh.num_elems, { + if(elem_types(elem_gid) == element_types::linear_quad || + elem_types(elem_gid) == element_types::linear_hex_ijk || + elem_types(elem_gid) == element_types::linear_hex || + elem_types(elem_gid) == element_types::arbitrary_hex ) + { + // at least one of them is true + } + else + { + // unknown element used + Kokkos::abort("Unknown element type in the mesh \n"); + } + }); + + // Convert from ensight linear hex to a IJK mesh + CArrayKokkos convert_ensight_to_ijk(8, "convert_ensight_to_ijk"); + + // Convert the arbitrary order hex to a IJK mesh + DCArrayKokkos convert_pn_vtk_to_ijk(mesh.num_nodes_in_elem, "convert_pn_vtk_to_ijk"); + + //build the connectivity for element type 12 + // elem_types.host(0) + switch(elem_types.host(0)){ + + case element_types::linear_quad: + // the node order is correct, no changes required + + FOR_ALL (elem_gid, 0, mesh.num_elems, { + + for (size_t node_lid=0; node_lid origin(num_dim); + // SimulationParamaters.mesh_input.origin.update_host(); + for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } + + // --- 2D parameters --- + // const int num_faces_in_elem = 4; // number of faces in elem + // const int num_points_in_elem = 4; // number of points in elem + // const int num_points_in_face = 2; // number of points in a face + // const int num_edges_in_elem = 4; // number of edges in a elem + + // --- mesh node ordering --- + // Convert ijk index system to the finite element numbering convention + // for vertices in elem + auto convert_point_number_in_quad = CArray(4); + convert_point_number_in_quad(0) = 0; + convert_point_number_in_quad(1) = 1; + 
convert_point_number_in_quad(2) = 3; + convert_point_number_in_quad(3) = 2; + + // intialize node variables + mesh.initialize_nodes(num_nodes); + + // initialize node state, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dim, required_node_state); + + // --- Build nodes --- + + // populate the point data structures + for (int j = 0; j < num_points_j; j++) { + for (int i = 0; i < num_points_i; i++) { + // global id for the point + int node_gid = get_id(i, j, 0, num_points_i, num_points_j); + + // store the point coordinates + node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; + node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; + } // end for i + } // end for j + + + node.coords.update_device(); + + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); + + // populate the elem center data structures + for (int j = 0; j < num_elems_j; j++) { + for (int i = 0; i < num_elems_i; i++) { + // global id for the elem + int elem_gid = get_id(i, j, 0, num_elems_i, num_elems_j); + + // store the point IDs for this elem where the range is + // (i:i+1, j:j+1 for a linear quad + int this_point = 0; + + for (int jcount = j; jcount <= j + 1; jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + int node_gid = get_id(icount, jcount, 0, num_points_i, num_points_j); + + // convert this_point index to the FE index convention + int this_index = convert_point_number_in_quad(this_point); + + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + + // increment the point counting index + this_point = this_point + 1; + } // end for icount + } // end for jcount + } // end for i + } // end for j + + // update device side + mesh.nodes_in_elem.update_device(); + + // intialize corner 
variables + int num_corners = num_elems * mesh.num_nodes_in_elem; + mesh.initialize_corners(num_corners); + // corner.initialize(num_corners, num_dim); + + // Build connectivity + mesh.build_connectivity(); + } // end build_2d_box + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_2d_polar + /// + /// \brief Builds an unstructured 2D polar mesh + /// + /// \param Simulation mesh that is built + /// \param Element state data + /// \param Node state data + /// \param Corner state data + /// \param Simulation parameters + /// + ///////////////////////////////////////////////////////////////////////////// + void build_2d_polar(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + SimulationParameters_t& SimulationParamaters) const + { + printf("Creating a 2D polar mesh \n"); + + int num_dim = 2; + + const double inner_radius = SimulationParamaters.mesh_input.inner_radius; + const double outer_radius = SimulationParamaters.mesh_input.outer_radius; + + const double start_angle = PI / 180.0 * SimulationParamaters.mesh_input.starting_angle; + const double end_angle = PI / 180.0 * SimulationParamaters.mesh_input.ending_angle; + + const int num_elems_i = SimulationParamaters.mesh_input.num_radial_elems; + const int num_elems_j = SimulationParamaters.mesh_input.num_angular_elems; + + const int num_points_i = num_elems_i + 1; // num points in x + const int num_points_j = num_elems_j + 1; // num points in y + + const int num_nodes = num_points_i * num_points_j; + + const double dx = (outer_radius - inner_radius) / ((double)num_elems_i); // len/(elems) + const double dy = (end_angle - start_angle) / ((double)num_elems_j); // len/(elems) + + const int num_elems = num_elems_i * num_elems_j; + + std::vector origin(num_dim); + + for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } + + // --- 2D parameters --- + // const int num_faces_in_elem = 4; // number of faces 
in elem + // const int num_points_in_elem = 4; // number of points in elem + // const int num_points_in_face = 2; // number of points in a face + // const int num_edges_in_elem = 4; // number of edges in a elem + + // --- mesh node ordering --- + // Convert ijk index system to the finite element numbering convention + // for vertices in elem + auto convert_point_number_in_quad = CArray(4); + convert_point_number_in_quad(0) = 0; + convert_point_number_in_quad(1) = 1; + convert_point_number_in_quad(2) = 3; + convert_point_number_in_quad(3) = 2; + + // intialize node variables + mesh.initialize_nodes(num_nodes); + + // initialize node state, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dim, required_node_state); + + // populate the point data structures + for (int j = 0; j < num_points_j; j++) { + for (int i = 0; i < num_points_i; i++) { + // global id for the point + int node_gid = get_id(i, j, 0, num_points_i, num_points_j); + + double r_i = inner_radius + (double)i * dx; + double theta_j = start_angle + (double)j * dy; + + // store the point coordinates + node.coords.host(node_gid, 0) = origin[0] + r_i * cos(theta_j); + node.coords.host(node_gid, 1) = origin[1] + r_i * sin(theta_j); + + if(node.coords.host(node_gid, 0) < 0.0){ + throw std::runtime_error("**** NODE RADIUS FOR RZ MESH MUST BE POSITIVE ****"); + } + + } // end for i + } // end for j + + + node.coords.update_device(); + + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); + + // populate the elem center data structures + for (int j = 0; j < num_elems_j; j++) { + for (int i = 0; i < num_elems_i; i++) { + // global id for the elem + int elem_gid = get_id(i, j, 0, num_elems_i, num_elems_j); + + // store the point IDs for this elem where the range is + // (i:i+1, j:j+1 for a linear quad + int this_point = 0; + + for (int jcount = j; jcount <= j + 1; 
jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + int node_gid = get_id(icount, jcount, 0, num_points_i, num_points_j); + + // convert this_point index to the FE index convention + int this_index = convert_point_number_in_quad(this_point); + + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + + // increment the point counting index + this_point = this_point + 1; + } // end for icount + } // end for jcount + } // end for i + } // end for j + + // update device side + mesh.nodes_in_elem.update_device(); + + // intialize corner variables + int num_corners = num_elems * mesh.num_nodes_in_elem; + mesh.initialize_corners(num_corners); + // corner.initialize(num_corners, num_dim); + + // Build connectivity + mesh.build_connectivity(); + } // end build_2d_box + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_3d_box + /// + /// \brief Builds an unstructured 3D rectilinear mesh + /// + /// \param Simulation mesh that is built + /// \param Element state data + /// \param Node state data + /// \param Corner state data + /// \param Simulation parameters + /// + ///////////////////////////////////////////////////////////////////////////// + void build_3d_box(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + SimulationParameters_t& SimulationParamaters) const + { + printf("Creating a 3D box mesh \n"); + + const int num_dim = 3; + + // SimulationParamaters.mesh_input.length.update_host(); + const double lx = SimulationParamaters.mesh_input.length[0]; + const double ly = SimulationParamaters.mesh_input.length[1]; + const double lz = SimulationParamaters.mesh_input.length[2]; + + // SimulationParamaters.mesh_input.num_elems.update_host(); + const int num_elems_i = SimulationParamaters.mesh_input.num_elems[0]; + const int num_elems_j = 
SimulationParamaters.mesh_input.num_elems[1]; + const int num_elems_k = SimulationParamaters.mesh_input.num_elems[2]; + + const int num_points_i = num_elems_i + 1; // num points in x + const int num_points_j = num_elems_j + 1; // num points in y + const int num_points_k = num_elems_k + 1; // num points in y + + const int num_nodes = num_points_i * num_points_j * num_points_k; + + const double dx = lx / ((double)num_elems_i); // len/(num_elems_i) + const double dy = ly / ((double)num_elems_j); // len/(num_elems_j) + const double dz = lz / ((double)num_elems_k); // len/(num_elems_k) + + const int num_elems = num_elems_i * num_elems_j * num_elems_k; + + std::vector origin(num_dim); + // SimulationParamaters.mesh_input.origin.update_host(); + for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } + + // --- 3D parameters --- + // const int num_faces_in_elem = 6; // number of faces in elem + // const int num_points_in_elem = 8; // number of points in elem + // const int num_points_in_face = 4; // number of points in a face + // const int num_edges_in_elem = 12; // number of edges in a elem + + + // initialize mesh node variables + mesh.initialize_nodes(num_nodes); + + // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dim, required_node_state); + + // --- Build nodes --- + + // populate the point data structures + for (int k = 0; k < num_points_k; k++) { + for (int j = 0; j < num_points_j; j++) { + for (int i = 0; i < num_points_i; i++) { + // global id for the point + int node_gid = get_id(i, j, k, num_points_i, num_points_j); + + // store the point coordinates + node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; + node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; + node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; + } // end for i + } // 
end for j + } // end for k + + + node.coords.update_device(); + + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); + + // --- Build elems --- + + // populate the elem center data structures + for (int k = 0; k < num_elems_k; k++) { + for (int j = 0; j < num_elems_j; j++) { + for (int i = 0; i < num_elems_i; i++) { + // global id for the elem + int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); + + // store the point IDs for this elem where the range is + // (i:i+1, j:j+1, k:k+1) for a linear hexahedron + int this_point = 0; + for (int kcount = k; kcount <= k + 1; kcount++) { + for (int jcount = j; jcount <= j + 1; jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + int node_gid = get_id(icount, jcount, kcount, + num_points_i, num_points_j); + + // convert this_point index to the FE index convention + int this_index = this_point; //convert_point_number_in_Hex(this_point); + + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + + // increment the point counting index + this_point = this_point + 1; + } // end for icount + } // end for jcount + } // end for kcount + } // end for i + } // end for j + } // end for k + + // update device side + mesh.nodes_in_elem.update_device(); + + // initialize corner variables + int num_corners = num_elems * mesh.num_nodes_in_elem; + mesh.initialize_corners(num_corners); + // corner.initialize(num_corners, num_dim); + + // Build connectivity + mesh.build_connectivity(); + } // end build_3d_box + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_3d_HexN_box + /// + /// \brief Builds an unstructured high order 3D rectilinear mesh + /// + /// \param Simulation mesh that is built + /// \param Element state data + /// \param Node state data + /// \param Corner state data + /// \param Simulation parameters + /// + 
///////////////////////////////////////////////////////////////////////////// + void build_3d_HexN_box(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + SimulationParameters_t& SimulationParamaters) const + { + printf(" ***** WARNING:: build_3d_HexN_box not yet implemented\n"); + const int num_dim = 3; + + // SimulationParamaters.mesh_input.length.update_host(); + const double lx = SimulationParamaters.mesh_input.length[0]; + const double ly = SimulationParamaters.mesh_input.length[1]; + const double lz = SimulationParamaters.mesh_input.length[2]; + + // SimulationParamaters.mesh_input.num_elems.update_host(); + const int num_elems_i = SimulationParamaters.mesh_input.num_elems[0]; + const int num_elems_j = SimulationParamaters.mesh_input.num_elems[1]; + const int num_elems_k = SimulationParamaters.mesh_input.num_elems[2]; + + // creating zones for the Pn order + const int Pn_order = SimulationParamaters.mesh_input.p_order; + + if (Pn_order > 19) { + printf("Fierro DG and RD solvers are only valid for elements up to Pn = 19 \n"); + return; + } + + const int num_zones_i = Pn_order*num_elems_i; + const int num_zones_j = Pn_order*num_elems_j; + const int num_zones_k = Pn_order*num_elems_k; + + const int num_points_i = num_zones_i+1; // num points in x accounting for Pn + const int num_points_j = num_zones_j+1; // num points in y accounting for Pn + const int num_points_k = num_zones_k+1; // num points in y accounting for Pn + + + const double dx = lx/((double)num_zones_i); // len/(num_zones_i) + const double dy = ly/((double)num_zones_j); // len/(num_zones_j) + const double dz = lz/((double)num_zones_k); // len/(num_zones_k) + + const int num_elems = num_elems_i*num_elems_j*num_elems_k; + // const int num_zones = num_zones_i*num_zones_j*num_zones_k; // accounts for Pn + + std::vector origin(num_dim); + for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } + + // --- 3D parameters --- + // const int 
num_faces_in_zone = 6; // number of faces in zone + // const int num_points_in_zone = 8; // number of points in zone + // const int num_points_in_face = 4; // number of points in a face + + // p_order = 1, 2, 3, 4, 5 + // num_nodes = 2, 3, 4, 5, 6 + const int num_1D_points = Pn_order+1; + const int num_points_in_elem = num_1D_points*num_1D_points*num_1D_points; + + + // --- elem --- + auto elem_coords = CArray (num_elems, num_dim); + auto elem_point_list = CArray (num_elems, num_points_in_elem); + + + // --- point --- + int num_points = num_points_i * num_points_j * num_points_k; + auto pt_coords = CArray (num_points, num_dim); + + + // --- Build nodes --- + + // initialize node variables + mesh.initialize_nodes(num_points); + + // + std::vector required_node_state = { node_state::coords }; + node.initialize(num_points, num_dim, required_node_state); + // populate the point data structures + for (int k = 0; k < num_points_k; k++){ + for (int j = 0; j < num_points_j; j++){ + for (int i = 0; i < num_points_i; i++){ + + + // global id for the point + int node_gid = get_id(i, j, k, num_points_i, num_points_j); + + // store the point coordinates + node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; + node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; + node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; + + } // end for k + } // end for i + } // end for j + + + node.coords.update_device(); + + + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); + + // --- Build elems --- + + // populate the elem center data structures accounting for Pn + for (int k=0; k graphics_times, + std::vector node_states, + std::vector gauss_pt_states, + std::vector material_pt_states, + const size_t solver_id) + { + + + // node_state is an enum for possible fields (e.g., coords, velocity, etc.), see state.h + // gauss_pt_state is an enum for possible fields (e.g., vol, divergence, etc.) 
+ // material_pt_state is an enum for possible fields (e.g., den, pres, etc.) + + + // ******************* + // Update host + // ******************* + + const size_t num_mats = State.MaterialPoints.num_material_points.size(); + + // material point values + + // Update host data for mat_pt state + for (auto field : material_pt_states){ + switch(field){ + // scalar vars to write out + case material_pt_state::density: + State.MaterialPoints.den.update_host(); + break; + case material_pt_state::pressure: + State.MaterialPoints.pres.update_host(); + break; + case material_pt_state::specific_internal_energy: + State.MaterialPoints.sie.update_host(); + break; + case material_pt_state::sound_speed: + State.MaterialPoints.sspd.update_host(); + break; + case material_pt_state::mass: + State.MaterialPoints.mass.update_host(); + break; + case material_pt_state::volume_fraction: + State.MaterialPoints.volfrac.update_host(); + State.MaterialPoints.geo_volfrac.update_host(); + break; + case material_pt_state::eroded_flag: + State.MaterialPoints.eroded.update_host(); + break; + // tensor vars to write out + case material_pt_state::stress: + State.MaterialPoints.stress.update_host(); + break; + + // additional vars for thermal-mechanical solver + case material_pt_state::thermal_conductivity: + State.MaterialPoints.conductivity.update_host(); + break; + + case material_pt_state::specific_heat: + State.MaterialPoints.specific_heat.update_host(); + break; + + // add other variables here + + // not used + case material_pt_state::elastic_modulii: + break; + case material_pt_state::shear_modulii: + break; + case material_pt_state::poisson_ratios: + break; + case material_pt_state::heat_flux: + break; + default: + std::cout<<"Desired material point state not understood in outputs"< elem_scalar_var_names(num_elem_scalar_vars); + std::vector elem_tensor_var_names(num_elem_tensor_vars); + + // Scalar, vector, and tensor values associated with a material in part elems + std::vector 
mat_elem_scalar_var_names(num_mat_pt_scalar_vars); + std::vector mat_elem_tensor_var_names(num_mat_pt_tensor_vars); + + + // the ids to access a variable in the mat_scalar_var_name or tensor list + int mat_den_id = -1; + int mat_pres_id = -1; + int mat_sie_id = -1; + int mat_sspd_id = -1; + int mat_mass_id = -1; + int mat_volfrac_id = -1; + int mat_geo_volfrac_id = -1; // geometric volume fraction of part + int mat_eroded_id = -1; + int mat_stress_id = -1; + + int mat_conductivity_id = -1; + int mat_specific_heat_id = -1; + + // the index for the scalar, vector, and tensor fields + size_t var = 0; + size_t vector_var = 0; + size_t tensor_var = 0; + + // material point state to output + for (auto field : SimulationParamaters.output_options.output_mat_pt_state){ + switch(field){ + // scalar vars + case material_pt_state::density: + mat_elem_scalar_var_names[var] = "mat_den"; + mat_den_id = var; + var++; + break; + case material_pt_state::pressure: + mat_elem_scalar_var_names[var] = "mat_pres"; + mat_pres_id = var; + var++; + break; + case material_pt_state::specific_internal_energy: + mat_elem_scalar_var_names[var] = "mat_sie"; + mat_sie_id = var; + var++; + break; + case material_pt_state::sound_speed: + mat_elem_scalar_var_names[var] = "mat_sspd"; + mat_sspd_id = var; + var++; + break; + case material_pt_state::mass: + mat_elem_scalar_var_names[var] = "mat_mass"; + mat_mass_id = var; + var++; + break; + case material_pt_state::volume_fraction: + mat_elem_scalar_var_names[var] = "mat_volfrac"; + mat_volfrac_id = var; + var++; + + mat_elem_scalar_var_names[var] = "mat_geo_volfrac"; + mat_geo_volfrac_id = var; + var++; + break; + case material_pt_state::eroded_flag: + mat_elem_scalar_var_names[var] = "mat_eroded"; + mat_eroded_id = var; + var++; + break; + // tensor vars + case material_pt_state::stress: + mat_elem_tensor_var_names[tensor_var] = "mat_stress"; + mat_stress_id = tensor_var; + tensor_var++; + break; + + + // additional vars for thermal-mechanical solver 
+ case material_pt_state::thermal_conductivity: + mat_elem_scalar_var_names[var] = "mat_thermal_K"; + mat_conductivity_id = var; + var++; + break; + + case material_pt_state::specific_heat: + mat_elem_scalar_var_names[var] = "mat_Cp"; + mat_specific_heat_id = var; + var++; + break; + + + // add other variables here + + // not used + case material_pt_state::elastic_modulii: + break; + case material_pt_state::shear_modulii: + break; + case material_pt_state::poisson_ratios: + break; + case material_pt_state::heat_flux: + break; + } // end switch + } // end for over mat_pt_states + + + // element average fields to output + + // the ids to access a variable in the elem_scalar_var_name or tensor list + int den_id = -1; + int pres_id = -1; + int sie_id = -1; + int sspd_id = -1; + int mass_id = -1; + int stress_id = -1; + + int conductivity_id = -1; + int specific_heat_id = -1; + + // reset the counters + var = 0; + vector_var = 0; + tensor_var = 0; + + // element state to output + for (auto field : SimulationParamaters.output_options.output_elem_state){ + switch(field){ + // scalar vars + case material_pt_state::density: + elem_scalar_var_names[var] = "den"; + den_id = var; + var++; + break; + case material_pt_state::pressure: + elem_scalar_var_names[var] = "pres"; + pres_id = var; + var++; + break; + case material_pt_state::specific_internal_energy: + elem_scalar_var_names[var] = "sie"; + sie_id = var; + var++; + break; + case material_pt_state::sound_speed: + elem_scalar_var_names[var] = "sspd"; + sspd_id = var; + var++; + break; + case material_pt_state::mass: + elem_scalar_var_names[var] = "mass"; + mass_id = var; + var++; + break; + // tensor vars + case material_pt_state::stress: + elem_tensor_var_names[tensor_var] = "stress"; + stress_id = tensor_var; + tensor_var++; + break; + + // heat transfer variables + case material_pt_state::thermal_conductivity: + elem_scalar_var_names[var] = "thermal_K"; + conductivity_id = var; + var++; + break; + + case 
material_pt_state::specific_heat: + elem_scalar_var_names[var] = "Cp"; + specific_heat_id = var; + var++; + break; + + // add other variables here + + // not used + case material_pt_state::volume_fraction: + break; + case material_pt_state::eroded_flag: + break; + case material_pt_state::elastic_modulii: + break; + case material_pt_state::shear_modulii: + break; + case material_pt_state::poisson_ratios: + break; + case material_pt_state::heat_flux: + break; + } // end switch + } // end for over mat_pt_states + + // append Gauss point vars to the element arrays + int vol_id = -1; + int div_id = -1; + int level_set_id = -1; + int vel_grad_id = -1; + + + for (auto field : SimulationParamaters.output_options.output_gauss_pt_state){ + switch(field){ + // scalars + case gauss_pt_state::volume: + elem_scalar_var_names[var] = "vol"; + vol_id = var; + var++; + break; + case gauss_pt_state::divergence_velocity: + elem_scalar_var_names[var] = "div"; + div_id = var; + var++; + break; + + case gauss_pt_state::level_set: + elem_scalar_var_names[var] = "level_set"; + level_set_id = var; + var++; + break; + + // tensors + case gauss_pt_state::gradient_velocity: + elem_tensor_var_names[tensor_var] = "vel_grad"; + vel_grad_id = tensor_var; + tensor_var++; + break; + } // end switch + } // end loop over gauss_pt_states + + + // ******************* + // nodal values + // ******************* + + size_t num_node_scalar_vars = 0; + size_t num_node_vector_vars = 0; + + for (auto field : SimulationParamaters.output_options.output_node_state){ + switch(field){ + // --- scalars + case node_state::mass: + num_node_scalar_vars ++; + break; + case node_state::temp: + num_node_scalar_vars ++; + break; + // -- vectors + case node_state::coords: + num_node_vector_vars ++; + break; + case node_state::velocity: + num_node_vector_vars ++; // for velocity + num_node_vector_vars ++; // for acceleration + break; + case node_state::gradient_level_set: + num_node_vector_vars ++; + break; + case 
node_state::force: + break; + + // heat transer vars + case node_state::heat_transfer: + break; + } // end switch + } // end for over + Kokkos::fence(); + + + // Scalar and vector values associated with a node + std::vector node_scalar_var_names(num_node_scalar_vars); + std::vector node_vector_var_names(num_node_vector_vars); + + int node_mass_id = -1; + int node_vel_id = -1; + int node_accel_id = -1; + int node_coord_id = -1; + int node_temp_id = -1; + int node_grad_level_set_id = -1; + + // reset counters for node fields + var = 0; + vector_var = 0; + tensor_var = 0; + + for (auto field : SimulationParamaters.output_options.output_node_state){ + switch(field){ + // scalars + case node_state::mass: + node_scalar_var_names[var] = "node_mass"; + node_mass_id = var; + var++; + break; + case node_state::temp: + node_scalar_var_names[var] = "node_temp"; + node_temp_id = var; + var++; + break; + + // vector fields + + case node_state::coords: + node_vector_var_names[vector_var] = "node_coords"; + node_coord_id = vector_var; + vector_var++; + break; + + case node_state::velocity: + node_vector_var_names[vector_var] = "node_vel"; + node_vel_id = vector_var; + vector_var++; + + node_vector_var_names[vector_var] = "node_accel"; + node_accel_id = vector_var; + vector_var++; + break; + + case node_state::gradient_level_set: + node_vector_var_names[vector_var] = "node_grad_lvlset"; + node_grad_level_set_id = vector_var; + vector_var++; + break; + + // -- not used vars + case node_state::force: + break; + + // heat transer vars + case node_state::heat_transfer: + break; + + // tensors + + } // end switch + } // end for over + + + // ************************************** + // build and save element average fields + // ************************************** + + // short hand + const size_t num_nodes = mesh.num_nodes; + const size_t num_elems = mesh.num_elems; + const size_t num_dims = mesh.num_dims; + const size_t num_nodes_in_elem = mesh.num_nodes_in_elem; + const int Pn_order 
= mesh.Pn; + + // save the elem state to an array for exporting to graphics files + DCArrayKokkos elem_scalar_fields(num_elem_scalar_vars, num_elems, "elem_scalars"); + DCArrayKokkos elem_tensor_fields(num_elem_tensor_vars, num_elems, 3, 3, "elem_tensors"); + elem_scalar_fields.set_values(0.0); + elem_tensor_fields.set_values(0.0); + + + // ----------------------------------------------------------------------- + // save the output fields to a single element average array for all state + // ----------------------------------------------------------------------- + for (int mat_id = 0; mat_id < num_mats; mat_id++) { + + // material point and guass point state are concatenated together + concatenate_elem_fields(State.MaterialPoints, + State.GaussPoints, + elem_scalar_fields, + elem_tensor_fields, + State.MaterialToMeshMaps.elem_in_mat_elem, + SimulationParamaters.output_options.output_elem_state, + SimulationParamaters.output_options.output_gauss_pt_state, + State.MaterialToMeshMaps.num_mat_elems.host(mat_id), + mat_id, + num_elems, + den_id, + pres_id, + sie_id, + sspd_id, + mass_id, + stress_id, + vol_id, + div_id, + level_set_id, + vel_grad_id, + conductivity_id, + specific_heat_id); + } // end for mats + + // make specific fields for the element average + if (sie_id>=0){ + FOR_ALL(elem_gid, 0, num_elems, { + // get sie by dividing by the mass + elem_scalar_fields(sie_id, elem_gid) /= (elem_scalar_fields(mass_id, elem_gid)+1.e-20); + }); + } // end if + + Kokkos::fence(); + elem_scalar_fields.update_host(); + elem_tensor_fields.update_host(); + + + // ************************ + // Build the nodal fields + // ************************ + + // save the nodal fields to an array for exporting to graphics files + DCArrayKokkos node_scalar_fields(num_node_scalar_vars, num_nodes, "node_scalars"); + DCArrayKokkos node_vector_fields(num_node_vector_vars, num_nodes, 3, "node_tenors"); + + concatenate_nodal_fields(State.node, + node_scalar_fields, + node_vector_fields, + 
SimulationParamaters.output_options.output_node_state, + dt, + num_nodes, + num_dims, + node_mass_id, + node_vel_id, + node_accel_id, + node_coord_id, + node_grad_level_set_id, + node_temp_id); + + + Kokkos::fence(); + node_scalar_fields.update_host(); + node_vector_fields.update_host(); + + + // ******************************** + // Write the nodal and elem fields + // ******************************** + + if (SimulationParamaters.output_options.format == output_options::viz || + SimulationParamaters.output_options.format == output_options::viz_and_state) { + + // create the folder structure if it does not exist + struct stat st; + + if (stat("vtk", &st) != 0) { + int returnCode = system("mkdir vtk"); + + if (returnCode == 1) { + std::cout << "Unable to make vtk directory" << std::endl; + } + } + else{ + if(solver_id==0 && graphics_id==0){ + // delete the existing files inside + int returnCode = system("rm vtk/Fierro*"); + if (returnCode == 1) { + std::cout << "Unable to clear vtk/Fierro directory" << std::endl; + } + } + } + + if (stat("vtk/data", &st) != 0) { + int returnCode = system("mkdir vtk/data"); + if (returnCode == 1) { + std::cout << "Unable to make vtk/data directory" << std::endl; + } + } + else{ + if(solver_id==0 && graphics_id==0){ + // delete the existing files inside the folder + int returnCode = system("rm vtk/data/Fierro*"); + if (returnCode == 1) { + std::cout << "Unable to clear vtk/data directory" << std::endl; + } + } + } + + // call the .vtu writer for element fields + std::string elem_fields_name = "fields"; + + // make a view of node coords for passing into functions + ViewCArray node_coords_host(&State.node.coords.host(0,0), num_nodes, num_dims); + ViewCArray nodes_in_elem_host(&mesh.nodes_in_elem.host(0,0), num_elems, num_nodes_in_elem); + + + write_vtu(node_coords_host, + nodes_in_elem_host, + elem_scalar_fields, + elem_tensor_fields, + node_scalar_fields, + node_vector_fields, + elem_scalar_var_names, + elem_tensor_var_names, + 
node_scalar_var_names, + node_vector_var_names, + elem_fields_name, + graphics_id, + num_nodes, + num_elems, + num_nodes_in_elem, + Pn_order, + num_dims, + solver_id); + + + // ******************************** + // Build and write the mat fields + // ******************************** + + + // note: the file path and folder was created in the elem and node outputs + size_t num_mat_files_written = 0; + if(num_mat_pt_scalar_vars > 0 || num_mat_pt_tensor_vars >0){ + + for (int mat_id = 0; mat_id < num_mats; mat_id++) { + + const size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); + + // only save material data if the mat lives on the mesh, ie. has state allocated + if (num_mat_elems>0){ + + // set the nodal vars to zero size, we don't write these fields again + node_scalar_var_names.clear(); + node_vector_var_names.clear(); + + // the arrays storing all the material field data + DCArrayKokkos mat_elem_scalar_fields(num_mat_pt_scalar_vars, num_mat_elems, "mat_pt_scalars"); + DCArrayKokkos mat_elem_tensor_fields(num_mat_pt_tensor_vars, num_mat_elems, 3, 3, "mat_pt_tensors"); + + + // concatenate material fields into a single array + concatenate_mat_fields(State.MaterialPoints, + mat_elem_scalar_fields, + mat_elem_tensor_fields, + State.MaterialToMeshMaps.elem_in_mat_elem, + SimulationParamaters.output_options.output_mat_pt_state, + num_mat_elems, + mat_id, + mat_den_id, + mat_pres_id, + mat_sie_id, + mat_sspd_id, + mat_mass_id, + mat_volfrac_id, + mat_geo_volfrac_id, + mat_eroded_id, + mat_stress_id, + mat_conductivity_id, + mat_specific_heat_id); + Kokkos::fence(); + mat_elem_scalar_fields.update_host(); + mat_elem_tensor_fields.update_host(); + + + std::string str_mat_val = std::to_string(mat_id); + std::string mat_fields_name = "mat"; + mat_fields_name += str_mat_val; // add the mat number + + // save the nodes belonging to this part (i.e., the material) + DCArrayKokkos mat_node_coords(num_nodes,num_dims, "mat_node_coords"); + DCArrayKokkos 
mat_nodes_in_mat_elem(num_mat_elems, num_nodes_in_elem, "mat_nodes_in_mat_elem"); + + // the number of actual nodes belonging to the part (i.e., the material) + size_t num_mat_nodes = 0; + + // build a unique mesh (element and nodes) for the material (i.e., the part) + build_material_elem_node_lists(mesh, + State.node.coords, + mat_node_coords, + mat_nodes_in_mat_elem, + State.MaterialToMeshMaps.elem_in_mat_elem, + mat_id, + num_mat_nodes, + num_mat_elems, + num_nodes_in_elem, + num_dims); + + ViewCArray mat_node_coords_host(&mat_node_coords.host(0,0), num_mat_nodes, num_dims); + ViewCArray mat_nodes_in_elem_host(&mat_nodes_in_mat_elem.host(0,0), num_mat_elems, num_nodes_in_elem); + + // write out a vtu file this + write_vtu(mat_node_coords_host, + mat_nodes_in_elem_host, + mat_elem_scalar_fields, + mat_elem_tensor_fields, + node_scalar_fields, + node_vector_fields, + mat_elem_scalar_var_names, + mat_elem_tensor_var_names, + node_scalar_var_names, + node_vector_var_names, + mat_fields_name, + graphics_id, + num_mat_nodes, + num_mat_elems, + num_nodes_in_elem, + Pn_order, + num_dims, + solver_id); + + + num_mat_files_written++; + + } // end for mat_id + + } // end if material is on the mesh + + } // end if mat variables are to be written + + + // ************************************************* + // write Paraview files to open the graphics files + // ************************************************* + + // save the graphics time + graphics_times(graphics_id) = time_value; + + // check to see if an mesh state was written + bool write_mesh_state = false; + if( num_elem_scalar_vars > 0 || + num_elem_tensor_vars > 0 || + num_node_scalar_vars > 0 || + num_node_vector_vars > 0) + { + write_mesh_state = true; + } + + // check to see if a mat state was written + bool write_mat_pt_state = false; + if( num_mat_pt_scalar_vars > 0 || + num_mat_pt_tensor_vars > 0) + { + write_mat_pt_state = true; + } + + // call the vtm file writer + std::string mat_fields_name = "mat"; + 
write_vtm(graphics_times, + elem_fields_name, + mat_fields_name, + time_value, + graphics_id, + num_mat_files_written, + write_mesh_state, + write_mat_pt_state, + solver_id); + + // call the pvd file writer + write_pvd(graphics_times, + time_value, + graphics_id, + solver_id); + + + // increment graphics id counter + graphics_id++; // this is private variable in the class + + } // end if viz paraview output is to be written + + + // STATE + if (SimulationParamaters.output_options.format == output_options::state || + SimulationParamaters.output_options.format == output_options::viz_and_state) { + + write_material_point_state(mesh, + State, + SimulationParamaters, + time_value, + graphics_times, + node_states, + gauss_pt_states, + material_pt_states); + + } // end if state is to be written + + + // will drop ensight outputs in the near future + if (SimulationParamaters.output_options.format == output_options::ensight){ + write_ensight(mesh, + State, + SimulationParamaters, + dt, + time_value, + graphics_times, + node_states, + gauss_pt_states, + material_pt_states); + } + + return; + + } // end write_mesh + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_ensight + /// + /// \brief Writes an ensight output file + /// + /// \param Simulation mesh + /// \param State data + /// \param Simulation parameters + /// \param current time value + /// \param Vector of all graphics output times + /// + ///////////////////////////////////////////////////////////////////////////// + void write_ensight(Mesh_t& mesh, + State_t& State, + SimulationParameters_t& SimulationParamaters, + double dt, + double time_value, + CArray graphics_times, + std::vector node_states, + std::vector gauss_pt_states, + std::vector material_pt_states) + { + size_t num_mats = State.MaterialPoints.num_material_points.size(); + + // ---- Update host data ---- + + // material point values + State.MaterialPoints.den.update_host(); + 
State.MaterialPoints.pres.update_host(); + State.MaterialPoints.stress.update_host(); + State.MaterialPoints.sspd.update_host(); + State.MaterialPoints.sie.update_host(); + State.MaterialPoints.mass.update_host(); + State.MaterialPoints.eroded.update_host(); + + + // gauss point values + State.GaussPoints.vol.update_host(); + + // nodal values + State.node.coords.update_host(); + State.node.vel.update_host(); + State.node.mass.update_host(); + + Kokkos::fence(); + + // -------------------------- + + const int num_scalar_vars = 10; + const int num_vec_vars = 3; + + std::string name_tmp; + name_tmp = "Outputs_SGH"; + + char* name = new char [name_tmp.length() + 1]; + std::strcpy(name, name_tmp.c_str()); + + const char scalar_var_names[num_scalar_vars][15] = { + "den", "pres", "sie", "vol", "mass", "sspd", "speed", "mat_id", "elem_switch", "eroded" + }; + + const char vec_var_names[num_vec_vars][15] = { + "pos", "vel", "accel" + }; + + // short hand + const size_t num_nodes = mesh.num_nodes; + const size_t num_elems = mesh.num_elems; + const size_t num_dims = mesh.num_dims; + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_scalar_vars); + int elem_switch = 1; + + + DCArrayKokkos speed(num_elems, "speed"); + FOR_ALL(elem_gid, 0, num_elems, { + double elem_vel[3]; // note:initialization with a list won't work + elem_vel[0] = 0.0; + elem_vel[1] = 0.0; + elem_vel[2] = 0.0; + // get the coordinates of the element center + for (int node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { + elem_vel[0] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 0); + elem_vel[1] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 1); + if (mesh.num_dims == 3) { + elem_vel[2] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 2); + } + else{ + elem_vel[2] = 0.0; + } + } // end loop over nodes in element + elem_vel[0] = elem_vel[0] / mesh.num_nodes_in_elem; + elem_vel[1] = elem_vel[1] / 
mesh.num_nodes_in_elem; + elem_vel[2] = elem_vel[2] / mesh.num_nodes_in_elem; + + double speed_sqrd = 0.0; + for (int dim = 0; dim < num_dims; dim++) { + speed_sqrd += elem_vel[dim] * elem_vel[dim]; + } + speed(elem_gid) = sqrt(speed_sqrd); + }); // end parallel for + speed.update_host(); + Kokkos::fence(); + + // save the output scale fields to a single 2D array + + // export material centeric data to the elements + for (int mat_id = 0; mat_id < num_mats; mat_id++) { + size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); + + for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) { + // 1 material per element + + // get elem gid + size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); + + // save outputs + elem_fields(elem_gid, 0) = State.MaterialPoints.den.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 1) = State.MaterialPoints.pres.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 2) = State.MaterialPoints.sie.host(mat_id, mat_elem_sid); + // 3 is guass point vol + elem_fields(elem_gid, 4) = State.MaterialPoints.mass.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 5) = State.MaterialPoints.sspd.host(mat_id, mat_elem_sid); + // 6 is elem speed + elem_fields(elem_gid, 7) = (double)mat_id; + // 8 is the e_switch + elem_fields(elem_gid, 9) = (double)State.MaterialPoints.eroded.host(mat_id, mat_elem_sid); + } // end for mat elems storage + } // end parallel loop over materials + + // export element centric data + double e_switch = 1; + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + elem_fields(elem_gid, 3) = State.GaussPoints.vol.host(elem_gid); + elem_fields(elem_gid, 6) = speed.host(elem_gid); + elem_fields(elem_gid, 8) = e_switch; + elem_switch *= -1; + } // end for elem_gid + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, num_vec_vars, 3); + + for (size_t node_gid = 0; node_gid < num_nodes; 
node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = State.node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = State.node.coords.host(node_gid, 1); + if (num_dims == 2) { + vec_fields(node_gid, 0, 2) = 0.0; + } + else{ + vec_fields(node_gid, 0, 2) = State.node.coords.host(node_gid, 2); + } + + // velocity, var 1 + vec_fields(node_gid, 1, 0) = State.node.vel.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = State.node.vel.host(node_gid, 1); + if (num_dims == 2) { + vec_fields(node_gid, 1, 2) = 0.0; + } + else{ + vec_fields(node_gid, 1, 2) = State.node.vel.host(node_gid, 2); + } + + // accelleration, var 2 + vec_fields(node_gid, 2, 0) = (State.node.vel.host(node_gid, 0) - State.node.vel_n0.host(node_gid, 0))/dt; + vec_fields(node_gid, 2, 1) = (State.node.vel.host(node_gid, 1) - State.node.vel_n0.host(node_gid, 1))/dt; + if (num_dims == 2) { + vec_fields(node_gid, 2, 2) = 0.0; + } + else{ + vec_fields(node_gid, 2, 2) = (State.node.vel.host(node_gid, 2) - State.node.vel_n0.host(node_gid, 2))/dt; + } + + + } // end for loop over vertices + + + // --------------------------------------------------------------------------- + // Setup of file and directoring for exporting + // --------------------------------------------------------------------------- + FILE* out[20]; // the output files that are written to + char filename[128]; + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + + if (stat("ensight", &st) != 0) { + system("mkdir ensight"); + } + + if (stat("ensight/data", &st) != 0) { + system("mkdir ensight/data"); + } + + // --------------------------------------------------------------------------- + // Write the Geometry file + // --------------------------------------------------------------------------- + // sprintf(filename, "ensight/data/%s.%05d.geo", name, graphics_id); + str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.geo", name, graphics_id); + // filename has the full string + if 
(str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "A graphics dump by Fierro \n"); + + fprintf(out[0], "%s", "EnSight Gold geometry\n"); + fprintf(out[0], "%s", "node id assign\n"); + fprintf(out[0], "%s", "element id assign\n"); + + fprintf(out[0], "part\n"); + fprintf(out[0], "%10d\n", 1); + fprintf(out[0], "Mesh\n"); + + // --- vertices --- + fprintf(out[0], "coordinates\n"); + fprintf(out[0], "%10lu\n", num_nodes); + + // write all components of the point coordinates + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 0)); + } + + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 1)); + } + + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + if (num_dims == 3) { + fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 2)); + } + else{ + fprintf(out[0], "%12.5e\n", 0.0); + } + } + + // --- elements --- + if (num_dims == 3) { + fprintf(out[0], "hexa8\n"); + } + else{ + fprintf(out[0], "quad4\n"); + } + fprintf(out[0], "%10lu\n", num_elems); + + + int convert_ijk_to_ensight[8]; + if(mesh.num_dims==3){ + convert_ijk_to_ensight[0] = 0; + convert_ijk_to_ensight[1] = 1; + convert_ijk_to_ensight[2] = 3; + convert_ijk_to_ensight[3] = 2; + convert_ijk_to_ensight[4] = 4; + convert_ijk_to_ensight[5] = 5; + convert_ijk_to_ensight[6] = 7; + convert_ijk_to_ensight[7] = 6; + } + else{ + + convert_ijk_to_ensight[0] = 0; + convert_ijk_to_ensight[1] = 1; + convert_ijk_to_ensight[2] = 2; + convert_ijk_to_ensight[3] = 3; + convert_ijk_to_ensight[4] = 4; + convert_ijk_to_ensight[5] = 5; + convert_ijk_to_ensight[6] = 6; + convert_ijk_to_ensight[7] = 7; + } // end if + + + // write all global point numbers for this cell + for (int elem_gid = 0; elem_gid < num_elems; elem_gid++) { + for (int node_lid = 0; node_lid < 
mesh.num_nodes_in_elem; node_lid++) { + fprintf(out[0], "%10lu\t", mesh.nodes_in_elem.host(elem_gid, convert_ijk_to_ensight[node_lid]) + 1); // note: node_gid starts at 1 + } + fprintf(out[0], "\n"); + } + + fclose(out[0]); + + // --------------------------------------------------------------------------- + // Write the Scalar variable files + // --------------------------------------------------------------------------- + + // ensight_vars = (den, pres,...) + for (int var = 0; var < num_scalar_vars; var++) { + // write a scalar value + // sprintf(filename, "ensight/data/%s.%05d.%s", name, graphics_id, scalar_var_names[var]); + str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, scalar_var_names[var]); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "Per_elem scalar values\n"); + fprintf(out[0], "part\n"); + fprintf(out[0], "%10d\n", 1); + if (num_dims == 3) { + fprintf(out[0], "hexa8\n"); + } + else{ + fprintf(out[0], "quad4\n"); + } + + for (int elem_id = 0; elem_id < num_elems; elem_id++) { + fprintf(out[0], "%12.5e\n", elem_fields(elem_id, var)); + } + + fclose(out[0]); + } // end for var + + // --------------------------------------------------------------------------- + // Write the Vector variable files + // --------------------------------------------------------------------------- + + // ensight vector vars = (position, velocity, force) + for (int var = 0; var < num_vec_vars; var++) { + // sprintf(filename, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); + str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + out[0] = fopen(filename, "w"); + // fprintf(out[0],"Per_node vector values\n"); + // fprintf(out[0],"part\n"); + // 
fprintf(out[0],"%10d \n",1); + // fprintf(out[0],"hexa8\n"); // WARNING, maybe bug here? + + fprintf(out[0], "Per_node vector values\n"); + fprintf(out[0], "part\n"); + fprintf(out[0], "%10d\n", 1); + fprintf(out[0], "block\n"); + + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 0)); + } + + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 1)); + } + + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 2)); + } + + fclose(out[0]); + } // end for var + + // --------------------------------------------------------------------------- + // Write the case file + // --------------------------------------------------------------------------- + + // sprintf(filename, "ensight/%s.case", name); + str_output_len = snprintf(filename, max_len, "ensight/%s.case", name); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "FORMAT\n"); + fprintf(out[0], "type: ensight gold\n"); + fprintf(out[0], "GEOMETRY\n"); + + // sprintf(filename, "model: data/%s.*****.geo\n", name); + str_output_len = snprintf(filename, max_len, "model: data/%s.*****.geo\n", name); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + fprintf(out[0], "%s", filename); + fprintf(out[0], "VARIABLE\n"); + + for (int var = 0; var < num_scalar_vars; var++) { + // sprintf(filename, "scalar per element: %s data/%s.*****.%s\n", scalar_var_names[var], name, scalar_var_names[var]); + str_output_len = snprintf(filename, max_len, "scalar per element: %s data/%s.*****.%s\n", scalar_var_names[var], name, scalar_var_names[var]); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + fprintf(out[0], "%s", filename); + } + + for (int var = 
0; var < num_vec_vars; var++) { + // sprintf(filename, "vector per node: %s data/%s.*****.%s\n", vec_var_names[var], name, vec_var_names[var]); + str_output_len = snprintf(filename, max_len, "vector per node: %s data/%s.*****.%s\n", vec_var_names[var], name, vec_var_names[var]); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + fprintf(out[0], "%s", filename); + } + + fprintf(out[0], "TIME\n"); + fprintf(out[0], "time set: 1\n"); + fprintf(out[0], "number of steps: %4d\n", graphics_id + 1); + fprintf(out[0], "filename start number: 0\n"); + fprintf(out[0], "filename increment: 1\n"); + fprintf(out[0], "time values: \n"); + + graphics_times(graphics_id) = time_value; + + for (int i = 0; i <= graphics_id; i++) { + fprintf(out[0], "%12.5e\n", graphics_times(i)); + } + fclose(out[0]); + + // --------------------------------------------------------------------------- + // Done writing the graphics dump + // --------------------------------------------------------------------------- + + // increment graphics id counter + graphics_id++; + + delete[] name; + + + return; + } + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_vtk_old + /// + /// \brief Writes a vtk output file + /// + /// \param Simulation mesh + /// \param State data + /// \param Simulation parameters + /// \param current time value + /// \param Vector of all graphics output times + /// + ///////////////////////////////////////////////////////////////////////////// + void write_vtk_old(Mesh_t& mesh, + State_t& State, + SimulationParameters_t& SimulationParamaters, + double dt, + double time_value, + CArray graphics_times, + std::vector node_states, + std::vector gauss_pt_states, + std::vector material_pt_states) + { + + size_t num_mats = State.MaterialPoints.num_material_points.size(); + + // ---- Update host data ---- + + // material point values + State.MaterialPoints.den.update_host(); + 
State.MaterialPoints.pres.update_host(); + State.MaterialPoints.stress.update_host(); + State.MaterialPoints.sspd.update_host(); + State.MaterialPoints.sie.update_host(); + State.MaterialPoints.mass.update_host(); + State.MaterialPoints.conductivity.update_host(); + State.MaterialPoints.temp_grad.update_host(); + State.MaterialPoints.eroded.update_host(); + + + // gauss point values + State.GaussPoints.vol.update_host(); + + // nodal values + State.node.coords.update_host(); + State.node.vel.update_host(); + State.node.mass.update_host(); + State.node.temp.update_host(); + + Kokkos::fence(); + + + const int num_cell_scalar_vars = 13; + const int num_cell_vec_vars = 0; + const int num_cell_tensor_vars = 0; + + const int num_point_scalar_vars = 1; + const int num_point_vec_vars = 2; + + + // Scalar values associated with a cell + const char cell_scalar_var_names[num_cell_scalar_vars][15] = { + "den", "pres", "sie", "vol", "mass", "sspd", "speed", "mat_id", "elem_switch","eroded", "temp_grad_x", "temp_grad_y", "temp_grad_z" + }; + + const char cell_vec_var_names[num_cell_vec_vars][15] = { + + }; + + const char point_scalar_var_names[num_point_scalar_vars][15] = { + "temp" + }; + + const char point_vec_var_names[num_point_vec_vars][15] = { + "pos", "vel" + }; + + // short hand + const size_t num_nodes = mesh.num_nodes; + const size_t num_elems = mesh.num_elems; + const size_t num_dims = mesh.num_dims; + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_cell_scalar_vars); + int elem_switch = 1; + + DCArrayKokkos speed(num_elems, "speed"); + FOR_ALL(elem_gid, 0, num_elems, { + double elem_vel[3]; // note:initialization with a list won't work + elem_vel[0] = 0.0; + elem_vel[1] = 0.0; + elem_vel[2] = 0.0; + // get the coordinates of the element center + for (int node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { + elem_vel[0] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 0); + elem_vel[1] 
+= State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 1); + if (mesh.num_dims == 3) { + elem_vel[2] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 2); + } + else{ + elem_vel[2] = 0.0; + } + } // end loop over nodes in element + elem_vel[0] = elem_vel[0] / mesh.num_nodes_in_elem; + elem_vel[1] = elem_vel[1] / mesh.num_nodes_in_elem; + elem_vel[2] = elem_vel[2] / mesh.num_nodes_in_elem; + + double speed_sqrd = 0.0; + for (int dim = 0; dim < num_dims; dim++) { + speed_sqrd += elem_vel[dim] * elem_vel[dim]; + } + speed(elem_gid) = sqrt(speed_sqrd); + }); // end parallel for + speed.update_host(); + Kokkos::fence(); + + // save the output scale fields to a single 2D array + + + // export material centeric data to the elements + for (int mat_id = 0; mat_id < num_mats; mat_id++) { + size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); + + for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) { + // 1 material per element + + // get elem gid + size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); + + // save outputs + elem_fields(elem_gid, 0) = State.MaterialPoints.den.host(mat_id,mat_elem_sid); + elem_fields(elem_gid, 1) = State.MaterialPoints.pres.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 2) = State.MaterialPoints.sie.host(mat_id, mat_elem_sid); + // 3 is guass point vol + elem_fields(elem_gid, 4) = State.MaterialPoints.mass.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 5) = State.MaterialPoints.sspd.host(mat_id, mat_elem_sid); + // 6 is elem speed + elem_fields(elem_gid, 7) = (double)mat_id; + // 8 is the e_switch + elem_fields(elem_gid, 9) = (double)State.MaterialPoints.eroded.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 10) = (double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,0); + elem_fields(elem_gid, 11) = (double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,1); + elem_fields(elem_gid, 12) = 
(double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,2); + } // end for mat elems storage + } // end parallel loop over materials + + // export element centric data + double e_switch = 1; + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + elem_fields(elem_gid, 3) = State.GaussPoints.vol.host(elem_gid); + elem_fields(elem_gid, 6) = speed.host(elem_gid); + elem_fields(elem_gid, 8) = State.GaussPoints.div.host(elem_gid); + elem_switch *= -1; + } // end for elem_gid + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, num_point_vec_vars, 3); + CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = State.node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = State.node.coords.host(node_gid, 1); + if (num_dims == 2) { + vec_fields(node_gid, 0, 2) = 0.0; + } + else{ + vec_fields(node_gid, 0, 2) = State.node.coords.host(node_gid, 2); + } + + // position, var 1 + vec_fields(node_gid, 1, 0) = State.node.vel.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = State.node.vel.host(node_gid, 1); + if (num_dims == 2) { + vec_fields(node_gid, 1, 2) = 0.0; + } + else{ + vec_fields(node_gid, 1, 2) = State.node.vel.host(node_gid, 2); + } + + point_scalar_fields(node_gid, 0) = State.node.temp.host(node_gid); + } // end for loop over vertices + + + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + + if (stat("vtk", &st) != 0) { + system("mkdir vtk"); + } + + // snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); + + //sprintf(filename, "vtk/Fierro.%05d.vtk", graphics_id); // mesh file + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d.vtk", graphics_id); + if (str_output_len >= max_len) { fputs("Filename 
length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "# vtk DataFile Version 2.0\n"); // part 2 + fprintf(out[0], "Mesh for Fierro\n"); // part 2 + fprintf(out[0], "ASCII \n"); // part 3 + fprintf(out[0], "DATASET UNSTRUCTURED_GRID\n\n"); // part 4 + + fprintf(out[0], "POINTS %zu float\n", mesh.num_nodes); + + // write all components of the point coordinates + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], + "%f %f %f\n", + State.node.coords.host(node_gid, 0), + State.node.coords.host(node_gid, 1), + State.node.coords.host(node_gid, 2)); + } // end for + + /* + --------------------------------------------------------------------------- + Write the elems + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "CELLS %lu %lu\n", mesh.num_elems, mesh.num_elems + mesh.num_elems * mesh.num_nodes_in_elem); // size=all printed values + + int Pn_order = mesh.Pn; + int order[3] = { Pn_order, Pn_order, Pn_order }; + + // const int num_1D_points = Pn_order+1; + + // write all global point numbers for this elem + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + fprintf(out[0], "%lu ", mesh.num_nodes_in_elem); // num points in this elem + + for (int k = 0; k <= Pn_order; k++) { + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + fprintf(out[0], "%lu ", mesh.nodes_in_elem.host(elem_gid, node_lid)); + } + } + } + + fprintf(out[0], "\n"); + } // end for + + // Write the element types + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_TYPES %zu \n", mesh.num_elems); + // VTK_LAGRANGE_HEXAHEDRON: 72, + // VTK_HIGHER_ORDER_HEXAHEDRON: 67 + // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 + // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html + // element types: 
https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html + // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + fprintf(out[0], "%d \n", 72); + } + + /* + --------------------------------------------------------------------------- + Write the nodal vector variables to file + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "POINT_DATA %zu \n", mesh.num_nodes); + + // vtk vector vars = (position, velocity) + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(out[0], "VECTORS %s float \n", point_vec_var_names[var]); + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], "%f %f %f\n", + vec_fields(node_gid, var, 0), + vec_fields(node_gid, var, 1), + vec_fields(node_gid, var, 2)); + } // end for nodes + } // end for vec_vars + + + // vtk scalar vars = (temp) + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", point_scalar_var_names[var]); + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], "%f\n", + point_scalar_fields(node_gid, 0)); + } // end for nodes + } // end for vec_vars + + /* + --------------------------------------------------------------------------- + Write the scalar elem variable to file + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_DATA %zu \n", mesh.num_elems); + + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + fprintf(out[0], "%f\n", 
elem_fields(elem_gid, var)); + } // end for elem + } // end for cell scalar_vars + + fclose(out[0]); + + graphics_times(graphics_id) = time_value; + + // Write time series metadata + //sprintf(filename, "vtk/Fierro.vtk.series", graphics_id); // mesh file + str_output_len = snprintf(filename, max_len, "vtk/Fierro.vtk.series"); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "{\n"); + fprintf(out[0], " \"file-series-version\" : \"1.0\",\n"); + fprintf(out[0], " \"files\" : [\n"); + + for (int i = 0; i <= graphics_id; i++) { + fprintf(out[0], " { \"name\" : \"Fierro.%05d.vtk\", \"time\" : %12.5e },\n", i, graphics_times(i) ); + } + + // fprintf(out[0], "%12.5e\n", graphics_times(i)); + fprintf(out[0], " ]\n"); // part 4 + fprintf(out[0], "}"); // part 4 + + fclose(out[0]); + + // increment graphics id counter + graphics_id++; + + + } // end write vtk old + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn concatenate_elem_fields + /// + /// \brief A function to calculate the average of elem fields and concatentate into 1 array + /// + /// + /// \param MaterialPoints a struct containing the material point state arrays + /// \param elem_scalar_fields the scalar fields + /// \param elem_tensor_fields the tensor fields + /// \param elem_in_mat_elem a listing of the element ids the material resides in + /// \param output_elem_state a std::vector of enums specifying the elem avg outputs + /// \param num_mat_elems the number of elements the material resides in + /// \param mat_id the index for the material + /// + ///////////////////////////////////////////////////////////////////////////// + void concatenate_elem_fields(const MaterialPoint_t& MaterialPoints, + const GaussPoint_t& GaussPoints, + DCArrayKokkos& elem_scalar_fields, + DCArrayKokkos& elem_tensor_fields, + const DRaggedRightArrayKokkos& 
elem_in_mat_elem,
        const std::vector<material_pt_state>& output_elem_state,
        const std::vector<gauss_pt_state>& output_gauss_pt_states,
        const size_t num_mat_elems,
        const size_t mat_id,
        const size_t num_elems,
        const int den_id,
        const int pres_id,
        const int sie_id,
        const int sspd_id,
        const int mass_id,
        const int stress_id,
        const int vol_id,
        const int div_id,
        const int level_set_id,
        const int vel_grad_id,
        const int conductivity_id,
        const int specific_heat_id)
    {
        // ------------------------------------------------------------------
        // Material point states: accumulate this material's volume-fraction
        // weighted contribution into the element-average arrays.
        // ------------------------------------------------------------------
        for (auto field : output_elem_state){
            switch(field){
                // scalar vars
                case material_pt_state::density:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        // map material storage index to the global element id
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        elem_scalar_fields(den_id, elem_gid) += MaterialPoints.den(mat_id, mat_pt_lid)*
                                                                MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                                                MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                    });
                    break;
                case material_pt_state::pressure:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        elem_scalar_fields(pres_id, elem_gid) += MaterialPoints.pres(mat_id, mat_pt_lid)*
                                                                 MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                                                 MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                    });
                    break;
                case material_pt_state::specific_internal_energy:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        // mass-weighted: extensive ie is accumulated here and is
                        // converted to specific ie after this function returns
                        elem_scalar_fields(sie_id, elem_gid) += MaterialPoints.mass(mat_id, mat_pt_lid)*
                                                                MaterialPoints.sie(mat_id, mat_pt_lid);
                    });
                    break;
                case material_pt_state::sound_speed:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        elem_scalar_fields(sspd_id, elem_gid) += MaterialPoints.sspd(mat_id, mat_pt_lid)*
                                                                 MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                                                 MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                    });
                    break;
                case material_pt_state::mass:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        // mass is extensive: summed directly, no volume fraction weight
                        elem_scalar_fields(mass_id, elem_gid) += MaterialPoints.mass(mat_id, mat_pt_lid);
                    });
                    break;

                // ---------------
                // tensor vars
                // ---------------
                case material_pt_state::stress:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        // tensor fields are always stored 3x3
                        // note: paraview is row-major, CArray convention
                        for (size_t i=0; i<3; i++){
                            for(size_t j=0; j<3; j++){
                                elem_tensor_fields(stress_id, elem_gid, i, j) +=
                                            MaterialPoints.stress(mat_id, mat_pt_lid,i,j) *
                                            MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                            MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                            } // end for j
                        } // end for i
                    });
                    break;

                // thermal solver vars
                case material_pt_state::thermal_conductivity:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        elem_scalar_fields(conductivity_id, elem_gid) += MaterialPoints.conductivity(mat_id, mat_pt_lid)*
                                                                         MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                                                         MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                    });
                    break;

                case material_pt_state::specific_heat:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        elem_scalar_fields(specific_heat_id, elem_gid) += MaterialPoints.specific_heat(mat_id, mat_pt_lid)*
                                                                          MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                                                          MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                    });
                    break;

                // add other variables here

                // states with no element-average output
                case material_pt_state::volume_fraction:
                    break;
                case material_pt_state::eroded_flag:
                    break;
                case material_pt_state::elastic_modulii:
                    break;
                case material_pt_state::shear_modulii:
                    break;
                case material_pt_state::poisson_ratios:
                    break;
                case material_pt_state::heat_flux:
                    break;
            } // end switch
        } // end for over mat point states

        // ------------------------------------------------------------------
        // Gauss point states: element centric, copied (not accumulated)
        // ------------------------------------------------------------------
        for (auto field : output_gauss_pt_states){
            switch(field){
                // scalars
                case gauss_pt_state::volume:
                    FOR_ALL(elem_gid, 0, num_elems, {
                        elem_scalar_fields(vol_id, elem_gid) = GaussPoints.vol(elem_gid);
                    });
                    break;

                case gauss_pt_state::divergence_velocity:
                    FOR_ALL(elem_gid, 0, num_elems, {
                        elem_scalar_fields(div_id, elem_gid) = GaussPoints.div(elem_gid);
                    });
                    break;

                case gauss_pt_state::level_set:
                    FOR_ALL(elem_gid, 0, num_elems, {
                        elem_scalar_fields(level_set_id, elem_gid) = GaussPoints.level_set(elem_gid);
                    });
                    break;

                // tensors
                case gauss_pt_state::gradient_velocity:
                    // note: paraview is row-major, CArray convention
                    FOR_ALL(elem_gid, 0, num_elems, {
                        for (size_t i=0; i<3; i++){
                            for(size_t j=0; j<3; j++){
                                elem_tensor_fields(vel_grad_id, elem_gid, i, j) =
                                        GaussPoints.vel_grad(elem_gid, i, j);
                            } // end for j
                        } // end for i
                    });
                    break;

                // add other gauss variables here

            } // end switch
        } // end loop over gauss_pt_states

    } // end of function

    /////////////////////////////////////////////////////////////////////////////
    ///
    /// \fn concatenate_mat_fields
    ///
    /// \brief A function to concatenate material fields into 1 array
    ///
    /// \param MaterialPoints a struct containing the material point state arrays
    /// \param mat_elem_scalar_fields the scalar fields
    /// \param mat_elem_tensor_fields the tensor fields
    /// \param elem_in_mat_elem a listing of the element ids the material resides in
    /// \param output_material_pt_states a std::vector of enums specifying the model
    ///
\param num_mat_elems the number of elements the material resides in + /// \param mat_id the index for the material + /// + ///////////////////////////////////////////////////////////////////////////// + void concatenate_mat_fields(const MaterialPoint_t& MaterialPoints, + DCArrayKokkos& mat_elem_scalar_fields, + DCArrayKokkos& mat_elem_tensor_fields, + const DRaggedRightArrayKokkos& elem_in_mat_elem, + const std::vector& output_material_pt_states, + const size_t num_mat_elems, + const size_t mat_id, + const int mat_den_id, + const int mat_pres_id, + const int mat_sie_id, + const int mat_sspd_id, + const int mat_mass_id, + const int mat_volfrac_id, + const int mat_geo_volfrac_id, + const int mat_eroded_id, + const int mat_stress_id, + const int mat_conductivity_id, + const int mat_specific_heat_id) + { + + // --- loop over the material point states + + for (auto field : output_material_pt_states){ + switch(field){ + // scalar vars + case material_pt_state::density: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + mat_elem_scalar_fields(mat_den_id, mat_elem_sid) = MaterialPoints.den(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::pressure: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + mat_elem_scalar_fields(mat_pres_id, mat_elem_sid) = MaterialPoints.pres(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::specific_internal_energy: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + // extensive ie here, but after this function, it will become specific ie + mat_elem_scalar_fields(mat_sie_id, mat_elem_sid) = MaterialPoints.sie(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::sound_speed: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + mat_elem_scalar_fields(mat_sspd_id, mat_elem_sid) = MaterialPoints.sspd(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::mass: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + mat_elem_scalar_fields(mat_mass_id, mat_elem_sid) = 
MaterialPoints.mass(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::volume_fraction: + // material volume fraction + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + // this is the volume fraction of a material within a part + mat_elem_scalar_fields(mat_volfrac_id, mat_elem_sid) = MaterialPoints.volfrac(mat_id, mat_elem_sid); + }); + + // geometric volume fraction + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + // this is the geometric volume fraction (interface reconstruction) + mat_elem_scalar_fields(mat_geo_volfrac_id, mat_elem_sid) = MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::eroded_flag: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + mat_elem_scalar_fields(mat_eroded_id, mat_elem_sid) = (double)MaterialPoints.eroded(mat_id, mat_elem_sid); + }); + break; + // --------------- + // tensor vars + // --------------- + case material_pt_state::stress: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + // average tensor fields, it is always 3D + // note: paraview is row-major, CArray convention + for (size_t i=0; i<3; i++){ + for(size_t j=0; j<3; j++){ + + // stress tensor + mat_elem_tensor_fields(mat_stress_id, mat_elem_sid, i, j) = + MaterialPoints.stress(mat_id, mat_elem_sid,i,j); + } // end for + } // end for + }); + break; + + // thermal solver vars + case material_pt_state::thermal_conductivity: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // get elem gid + size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); + + // field + mat_elem_scalar_fields(mat_conductivity_id, elem_gid) += MaterialPoints.conductivity(mat_id, mat_elem_sid); + }); + break; + + case material_pt_state::specific_heat: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // get elem gid + size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); + + // field + mat_elem_scalar_fields(mat_specific_heat_id, elem_gid) += MaterialPoints.specific_heat(mat_id, mat_elem_sid); + }); + break; + + // 
add other variables here + + // not used variables + case material_pt_state::elastic_modulii: + break; + case material_pt_state::shear_modulii: + break; + case material_pt_state::poisson_ratios: + break; + case material_pt_state::heat_flux: + break; + } // end switch + }// end for over mat point state + + + } // end of function + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn concatenate_nodal_fields + /// + /// \brief A function to calculate the average of elem fields + /// + /// + /// \param Node a struct containing the material point state arrays + /// \param elem_scalar_fields the scalar fields + /// \param elem_tensor_fields the tensor fields + /// \param elem_in_mat_elem a listing of the element ids the material resides in + /// \param output_node_states a std::vector of enums specifying the model + /// \param num_mat_elems the number of elements the material resides in + /// \param mat_id the index for the material + /// + ///////////////////////////////////////////////////////////////////////////// + void concatenate_nodal_fields(const node_t& Node, + DCArrayKokkos& node_scalar_fields, + DCArrayKokkos& node_vector_fields, + std::vector& output_node_states, + double dt, + const size_t num_nodes, + const size_t num_dims, + const int node_mass_id, + const int node_vel_id, + const int node_accel_id, + const int node_coord_id, + const int node_grad_level_set_id, + const int node_temp_id) + { + for (auto field : output_node_states){ + switch(field){ + // scalars + case node_state::mass: + + FOR_ALL(node_gid, 0, num_nodes, { + node_scalar_fields(node_mass_id, node_gid) = Node.mass(node_gid); + }); + + break; + case node_state::temp: + FOR_ALL(node_gid, 0, num_nodes, { + node_scalar_fields(node_temp_id, node_gid) = Node.temp(node_gid); + }); + + break; + + // vector fields + + case node_state::coords: + + FOR_ALL(node_gid, 0, num_nodes, { + + node_vector_fields(node_coord_id, node_gid, 0) = Node.coords(node_gid, 0); 
+ node_vector_fields(node_coord_id, node_gid, 1) = Node.coords(node_gid, 1); + if (num_dims == 2) { + node_vector_fields(node_coord_id, node_gid, 2) = 0.0; + } + else{ + node_vector_fields(node_coord_id, node_coord_id, 2) = Node.coords(node_gid, 2); + } // end if + + }); // end parallel for + + break; + case node_state::velocity: + + FOR_ALL(node_gid, 0, num_nodes, { + + // velocity, var is node_vel_id + node_vector_fields(node_vel_id, node_gid, 0) = Node.vel(node_gid, 0); + node_vector_fields(node_vel_id, node_gid, 1) = Node.vel(node_gid, 1); + if (num_dims == 2) { + node_vector_fields(node_vel_id, node_gid, 2) = 0.0; + } + else{ + node_vector_fields(node_vel_id, node_gid, 2) = Node.vel(node_gid, 2); + } // end if + + // accellerate, var is node_accel_id + node_vector_fields(node_accel_id, node_gid, 0) = (Node.vel(node_gid, 0) - Node.vel_n0(node_gid, 0))/dt; + node_vector_fields(node_accel_id, node_gid, 1) = (Node.vel(node_gid, 1) - Node.vel_n0(node_gid, 1))/dt; + if (num_dims == 2) { + node_vector_fields(node_accel_id, node_gid, 2) = 0.0; + } + else{ + node_vector_fields(node_accel_id, node_gid, 2) = (Node.vel(node_gid, 2) - Node.vel_n0(node_gid, 2))/dt; + } // end if + + }); // end parallel for + + break; + + + case node_state::gradient_level_set: + + FOR_ALL(node_gid, 0, num_nodes, { + + // velocity, var is node_vel_id + node_vector_fields(node_grad_level_set_id, node_gid, 0) = Node.gradient_level_set(node_gid, 0); + node_vector_fields(node_grad_level_set_id, node_gid, 1) = Node.gradient_level_set(node_gid, 1); + if (num_dims == 2) { + node_vector_fields(node_grad_level_set_id, node_gid, 2) = 0.0; + } + else{ + node_vector_fields(node_grad_level_set_id, node_gid, 2) = Node.gradient_level_set(node_gid, 2); + } // end if + + }); // end parallel for + + break; + + // -- not used vars + case node_state::force: + break; + + // heat transer vars + case node_state::heat_transfer: + break; + // tensors + } // end switch + } // end for over + + + + } // end function + + 
///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_vtu + /// + /// \brief Writes a vtu ASCII output file + /// + /// \param Simulation mesh + /// \param State data + /// \param Simulation parameters + /// \param current time value + /// \param Vector of all graphics output times + /// + ///////////////////////////////////////////////////////////////////////////// + void write_vtu( + const ViewCArray& node_coords_host, + const ViewCArray& nodes_in_elem_host, + const DCArrayKokkos& elem_scalar_fields, + const DCArrayKokkos& elem_tensor_fields, + const DCArrayKokkos& node_scalar_fields, + const DCArrayKokkos& node_vector_fields, + const std::vector& elem_scalar_var_names, + const std::vector& elem_tensor_var_names, + const std::vector& node_scalar_var_names, + const std::vector& node_vector_var_names, + const std::string partname, + const int graphics_id, + const size_t num_nodes, + const size_t num_elems, + const size_t num_nodes_in_elem, + const int Pn_order, + const size_t num_dims, + const size_t solver_id + ) + { + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + const size_t num_elem_scalar_vars = elem_scalar_var_names.size(); + const size_t num_elem_tensor_vars = elem_tensor_var_names.size(); + + const size_t num_node_scalar_vars = node_scalar_var_names.size(); + const size_t num_node_vector_vars = node_vector_var_names.size(); + + + // create filename + str_output_len = snprintf(filename, max_len, "vtk/data/Fierro.solver%zu.%s.%05d.vtu", + solver_id, partname.c_str(), graphics_id); + + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "\n"); + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + fprintf(out[0], " \n", num_nodes, num_elems); + + /* + 
--------------------------------------------------------------------------- + Write the mesh points + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + + // write all components of the point coordinates + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + double coord_z = 0.0; + if(num_dims==3){ + coord_z = node_coords_host(node_gid, 2); + } + fprintf(out[0], + " %f %f %f\n", + node_coords_host(node_gid, 0), + node_coords_host(node_gid, 1), + coord_z); + } // end for + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + + /* + --------------------------------------------------------------------------- + Write the elems + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + + // WARNING: look into high-order Pn 2D elements with paraview + int Pn_order_z = 0; + if (num_dims == 3){ + Pn_order_z = Pn_order; + } + int order[3] = {Pn_order, Pn_order, Pn_order_z}; + + // const int num_1D_points = Pn_order+1; + + // write all global point numbers for this elem + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(out[0], " "); // adding indentation before printing nodes in element + if (num_dims==3 && Pn_order>1){ + for (int k = 0; k <= Pn_order_z; k++) { + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); + } + } + } // end for + } + else if (num_dims == 3 && Pn_order == 1){ + // 3D linear hexahedral elements + for (int node_lid = 0; node_lid < 8; node_lid++) { + fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); + } // end for + } + else if (num_dims == 2){ + // 2D linear is the only supported option + for (int 
node_lid = 0; node_lid < 4; node_lid++) { + fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); + } // end for + } + else { + std::cout << "ERROR: outputs failed, dimensions and element types are not compatible \n"; + } // end if + fprintf(out[0], "\n"); + } // end for + fprintf(out[0], " \n"); + + // Write the element offsets + fprintf(out[0], " \n"); + size_t count=0; + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + count += num_nodes_in_elem; + fprintf(out[0], " %lu\n", count); // num points in this elem + all others before it + } // end for + fprintf(out[0], " \n"); + + + // Write the element types + fprintf(out[0], " \n"); + // ---- + // linear element types + // VTK_PIXEL = 8, linear 2D quad with i,j,k indexing (future format for 2D solver) + // VTK_Quad = 9, linear 2D quad with ensight index ordering (current 2D rz convention) + // VTK_VOXEL = 11, linear 3D hex with i,j,k indexing (current format) + // arbitrary order types + // VTK_LAGRANGE_QUADRILATERAL = 70, use this type when a 2D high-order scheme exists + // VTK_LAGRANGE_HEXAHEDRON: 72, this is the current 3D high-order + // VTK_HIGHER_ORDER_HEXAHEDRON: 67 + // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 + // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html + // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html + // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + if (num_dims==3 && Pn_order>1){ + fprintf(out[0], " %d \n", 72); + } + else if (num_dims == 3 && Pn_order == 1){ + // 3D linear hex + fprintf(out[0], " %d \n", 11); + } + else { + // 2D ensight mesh ordering + fprintf(out[0], " %d \n", 9); + } + } + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + + + /* + --------------------------------------------------------------------------- + Write the nodal variables to file + 
--------------------------------------------------------------------------- + */ + // vtk vector vars = (position, velocity) + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + if(num_node_vector_vars >0 || num_node_scalar_vars>0){ + + fprintf(out[0], " \n"); + + // node vectors + for (int a_var = 0; a_var < num_node_vector_vars; a_var++) { + fprintf(out[0], " \n", node_vector_var_names[a_var].c_str()); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], " %f %f %f\n", + node_vector_fields.host(a_var, node_gid, 0), + node_vector_fields.host(a_var, node_gid, 1), + node_vector_fields.host(a_var, node_gid, 2)); + } // end for nodes + fprintf(out[0], " \n"); + + } // end for vec_vars + + + // node scalar vars + for (int a_var = 0; a_var < num_node_scalar_vars; a_var++) { + fprintf(out[0], " \n", node_scalar_var_names[a_var].c_str()); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], " %f\n", node_scalar_fields.host(a_var, node_gid)); + } // end for nodes + fprintf(out[0], " \n"); + } // end for vec_vars + + fprintf(out[0], " \n"); + + } // end if + + /* + --------------------------------------------------------------------------- + Write the elem variables to file + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + if(num_elem_scalar_vars >0 || num_elem_tensor_vars>0){ + + fprintf(out[0], " \n"); + + for (int a_var = 0; a_var < num_elem_scalar_vars; a_var++) { + + fprintf(out[0], " \n", elem_scalar_var_names[a_var].c_str()); // the 1 is number of scalar components [1:4] + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(out[0], " %f\n", elem_scalar_fields.host(a_var, elem_gid)); + } // end for elem + fprintf(out[0], " \n"); + } // end for elem scalar_vars + + + // tensors + for (int a_var = 0; a_var < num_elem_tensor_vars; a_var++) { + fprintf(out[0], " \n", elem_tensor_var_names[a_var].c_str()); 
// the 1 is number of scalar components [1:4] + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + // note: paraview is row-major, CArray convention + // Txx Txy Txz Tyx Tyy Tyz Tzx Tzy Tzz + for (size_t i=0; i<3; i++){ + for(size_t j=0; j<3; j++){ + fprintf(out[0], " %f ", elem_tensor_fields.host(a_var, elem_gid, i, j)); + } // end j + } // end i + } // end for elem + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + } // end for elem scalar_vars + + fprintf(out[0], " \n"); + } // end if + + // end of the vtu file + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + fprintf(out[0], "\n"); + + //----------------- + // close the vtu file for element fields + //----------------- + fclose(out[0]); + + } // end write vtu + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_pvd + /// + /// \brief Writes a pvd ASCII output file for the element and nodal fields + /// + /// \param Vector of all graphics output times + /// \param element average field names + /// \param current time value + /// \param graphics index + /// + ///////////////////////////////////////////////////////////////////////////// + void write_pvd(CArray& graphics_times, + double time_value, + int graphics_id, + const size_t solver_id){ + + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + // Write time series metadata + str_output_len = snprintf(filename, max_len, "vtk/Fierro.solver%zu.pvd", solver_id); + + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "\n"); + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + + for (int i = 0; i <= graphics_id; i++) { + fprintf(out[0], " \n", + graphics_times(i), solver_id, i, graphics_times(i) ); + //fprintf(out[0], " \n", + // i, solver_id, i, graphics_times(i) ); + } + + 
fprintf(out[0], " \n"); + fprintf(out[0], ""); + + fclose(out[0]); + + } // end pvd + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_vtm + /// + /// \brief Writes a vtm ASCII output file for all fields -- mesh and material + /// + /// \param Vector of all graphics output times + /// \param element average field names + /// \param current time value + /// \param graphics index + /// + ///////////////////////////////////////////////////////////////////////////// + void write_vtm(CArray& graphics_times, + const std::string& elem_part_name, + const std::string& mat_part_name, + double time_value, + int graphics_id, + int num_mats, + bool write_mesh_state, + bool write_mat_pt_state, + const size_t solver_id) + { + // loop over all the files that were written + for(int file_id=0; file_id<=graphics_id; file_id++){ + + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + + // Write time series metadata to the data file + str_output_len = snprintf(filename, max_len, "vtk/data/Fierro.solver%zu.%05d.vtm", solver_id, file_id); + + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "\n"); + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + + + // Average mesh fields -- node and elem state written + size_t block_id = 0; // this will need to be incremented based on the number of mesh fields written + if (write_mesh_state){ + fprintf(out[0], " \n", block_id); + { + block_id++; // increment block id for material outputs that follow the element avg block + + // elem and nodal fields are in this file + fprintf(out[0], " \n"); + fprintf(out[0], " \n", + file_id, solver_id, elem_part_name.c_str(), file_id, graphics_times(file_id) ); + fprintf(out[0], " \n"); + + // add other Mesh average output Pieces here + } + 
fprintf(out[0], " \n"); + } // end if write elem and node state is true + + // note: the block_id was incremented if an element average field output was made + if (write_mat_pt_state){ + fprintf(out[0], " \n", block_id); + for (size_t mat_id=0; mat_id\n", mat_id, mat_id); + fprintf(out[0], " \n", + file_id, solver_id, mat_part_name.c_str(), mat_id, file_id, graphics_times(file_id) ); + fprintf(out[0], " \n"); + + } // end for loop mat_id + fprintf(out[0], " \n"); + } // end if write mat satte is true + + // done writing the files to be read by the vtm file + fprintf(out[0], " \n"); + fprintf(out[0], ""); + + fclose(out[0]); + + } // end for file_id + + } // end vtm + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_material_elem_node_lists + /// + /// \brief Creates elems and nodes for a unique mesh of a material (i.e, a part) + /// + /// \param Simulation mesh + /// \param State node data + /// \param Material node coordinates + /// \param Material nodes in the material element + /// \param Material to mesh map for elements + /// \param number of material nodes + /// \param number of material elements + /// \param number of nodes in the element + /// \param number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void build_material_elem_node_lists( + const Mesh_t& mesh, + const DCArrayKokkos& state_node_coords, + DCArrayKokkos& mat_node_coords, + DCArrayKokkos & mat_nodes_in_mat_elem, + const DRaggedRightArrayKokkos& elem_in_mat_elem, + const size_t mat_id, + size_t& num_mat_nodes, + const size_t num_mat_elems, + const size_t num_nodes_in_elem, + const size_t num_dims) + { + + // helper arrays + DCArrayKokkos dummy_counter(mesh.num_nodes, "dummy_counter"); + DCArrayKokkos access_mat_node_gids(mesh.num_nodes, "access_mat_node_gids"); + dummy_counter.set_values(0); + + // tag and count the number of nodes in this part + FOR_ALL (mat_elem_sid, 0, num_mat_elems, 
{ + // get elem gid + size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); // WARNING not GPU compatible + + // parallel loop over the nodes in the element + for(size_t node_lid=0; node_lid0 + + } // end for nodes in element + + }); // end parallel for + Kokkos::fence(); + dummy_counter.update_host(); + + // loop opperation is not thread safe, must be run serially + size_t mat_node_gid = 0; + for(size_t node_gid = 0; node_gid0){ + mat_node_coords.host(mat_node_gid, 0) = state_node_coords.host(node_gid, 0); + mat_node_coords.host(mat_node_gid, 1) = state_node_coords.host(node_gid, 1); + if (num_dims == 3){ + mat_node_coords.host(mat_node_gid, 2) = state_node_coords.host(node_gid, 2); + } // end if on dims + + access_mat_node_gids.host(node_gid) = mat_node_gid; // the part node id + + mat_node_gid ++; + + dummy_counter.host(node_gid) = 0; // set counter to zero, it was accounted for + } // end if this node is on the part + + } // end loop over all mesh nodes + mat_node_coords.update_device(); + access_mat_node_gids.update_device(); + dummy_counter.update_device(); + Kokkos::fence(); + + // save the number of nodes defining the material region, i.e., the part + num_mat_nodes = mat_node_gid; + + // save the new node id's + FOR_ALL (mat_elem_sid, 0, num_mat_elems, { + // get elem gid + size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); + + // parallel loop over the nodes in the element + for(size_t node_lid=0; node_lid graphics_times, + std::vector node_states, + std::vector gauss_pt_states, + std::vector material_pt_states) + { + // WARNING WARNING WARNING: + // This currently assumes the gauss and material point IDs are the same as the element ID + // This will need to be updated for high order methods + + // Update host data + // ---- Update host data ---- + size_t num_mats = State.MaterialPoints.num_material_points.size(); + + State.MaterialPoints.den.update_host(); + State.MaterialPoints.pres.update_host(); + State.MaterialPoints.stress.update_host(); + 
State.MaterialPoints.sspd.update_host(); + State.MaterialPoints.sie.update_host(); + State.MaterialPoints.mass.update_host(); + + State.GaussPoints.vol.update_host(); + + State.node.coords.update_host(); + State.node.vel.update_host(); + State.node.mass.update_host(); + + Kokkos::fence(); + + struct stat st; + + if (stat("state", &st) != 0) { + system("mkdir state"); + } + + size_t num_dims = mesh.num_dims; + + // --------------------------------------------------------------------------- + // Setup of file and directory for exporting + // --------------------------------------------------------------------------- + + // output file + FILE* out_elem_state; // element average state + char filename[128]; + + int max_len = sizeof filename; + + snprintf(filename, max_len, "state/mat_pt_state_t_%6.4e.txt", time_value); + + // output files + out_elem_state = fopen(filename, "w"); + + // write state dump + fprintf(out_elem_state, "# state dump file\n"); + fprintf(out_elem_state, "# x y z radius_2D radius_3D den pres sie sspd vol mass \n"); + + // write out values for the elem + for (size_t mat_id = 0; mat_id < num_mats; mat_id++) { + + size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); + + for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) + { + + const size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); + + double elem_coords[3]; + elem_coords[0] = 0.0; + elem_coords[1] = 0.0; + elem_coords[2] = 0.0; + + // get the coordinates of the element center + for (size_t node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { + + elem_coords[0] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 0); + elem_coords[1] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 1); + if (num_dims == 3) { + elem_coords[2] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 2); + } + else{ + elem_coords[2] = 0.0; + } + } // end loop over nodes 
in element + + elem_coords[0] = elem_coords[0] / ((double)mesh.num_nodes_in_elem); + elem_coords[1] = elem_coords[1] / ((double)mesh.num_nodes_in_elem); + elem_coords[2] = elem_coords[2] / ((double)mesh.num_nodes_in_elem); + + double rad2 = sqrt(elem_coords[0] * elem_coords[0] + + elem_coords[1] * elem_coords[1]); + + double rad3 = sqrt(elem_coords[0] * elem_coords[0] + + elem_coords[1] * elem_coords[1] + + elem_coords[2] * elem_coords[2]); + + + fprintf(out_elem_state, "%4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t \n", + elem_coords[0], + elem_coords[1], + elem_coords[2], + rad2, + rad3, + State.MaterialPoints.den.host(mat_id, mat_elem_sid), + State.MaterialPoints.pres.host(mat_id, mat_elem_sid), + State.MaterialPoints.sie.host(mat_id, mat_elem_sid), + State.MaterialPoints.sspd.host(mat_id, mat_elem_sid), + State.GaussPoints.vol.host(elem_gid), + State.MaterialPoints.mass.host(mat_id, mat_elem_sid) ); + + } // end for elements + + } // end for materials + fclose(out_elem_state); + + + + // printing nodal state + + FILE* out_point_state; // element average state + + snprintf(filename, max_len, "state/node_state_t_%6.4e.txt", time_value); + + // output files + out_point_state = fopen(filename, "w"); + + // write state dump + fprintf(out_point_state, "# state node dump file\n"); + fprintf(out_point_state, "# x y z radius_2D radius_3D vel_x vel_y vel_z speed ||err_v_dot_r|| \n"); + + // get the coordinates of the node + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + + double node_coords[3]; + + node_coords[0] = State.node.coords.host(node_gid, 0); + node_coords[1] = State.node.coords.host(node_gid, 1); + if (num_dims == 3) { + node_coords[2] = State.node.coords.host(node_gid, 2); + } + else{ + node_coords[2] = 0.0; + } + + double rad2 = sqrt(node_coords[0] * node_coords[0] + + node_coords[1] * node_coords[1]); + double rad3 = sqrt(node_coords[0] * node_coords[0] + + node_coords[1] * 
node_coords[1] + + node_coords[2] * node_coords[2]); + + double node_vel[3]; + + node_vel[0] = State.node.vel.host(node_gid, 0); + node_vel[1] = State.node.vel.host(node_gid, 1); + if (num_dims == 3) { + node_vel[2] = State.node.vel.host(node_gid, 2); + } + else{ + node_vel[2] = 0.0; + } + + double speed = sqrt(node_vel[0] * node_vel[0] + + node_vel[1] * node_vel[1] + + node_vel[2] * node_vel[2]); + + + + // looking at perfect radial motion + double unit_r_vec[2]; + unit_r_vec[0] = node_coords[0]/rad2; + unit_r_vec[1] = node_coords[1]/rad2; + + //the radial motion + double v_dot_r = node_vel[0] * unit_r_vec[0] + + node_vel[1] * unit_r_vec[1]; + + + double err_v_dot_r[3]; + err_v_dot_r[0] = node_vel[0]-unit_r_vec[0]*v_dot_r; + err_v_dot_r[1] = node_vel[1]-unit_r_vec[1]*v_dot_r; + + double mag_err_v_dot_r = sqrt(err_v_dot_r[0]*err_v_dot_r[0] + err_v_dot_r[1]*err_v_dot_r[1]); + + fprintf(out_point_state, "%4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t \n", + node_coords[0], + node_coords[1], + node_coords[2], + rad2, + rad3, + node_vel[0], + node_vel[1], + node_vel[2], + speed, + mag_err_v_dot_r); + + + } // end loop over nodes in element + + + fclose(out_point_state); + + + return; + } // end of state output +}; // end class + +#endif // end Header Guard \ No newline at end of file From c2e3ce57b06da6752b3080b95ddf1bffdfe3dd46 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 22 Oct 2025 15:12:30 -0500 Subject: [PATCH 02/52] ENH: Adding mesh decomposition example WIP --- .gitignore | 3 +- examples/CMakeLists.txt | 143 +- examples/mesh_decomp/CMakeLists.txt | 23 +- examples/mesh_decomp/install_ptscotch.sh | 6 +- examples/mesh_decomp/mesh.h | 1 - examples/mesh_decomp/mesh_decomp.cpp | 57 +- examples/mesh_decomp/mesh_inputs.h | 141 + examples/mesh_decomp/mesh_io.h | 4898 +--------------------- examples/mesh_decomp/state.h | 139 + scripts/build-matar.sh | 2 +- 10 files changed, 514 insertions(+), 4899 deletions(-) create 
mode 100644 examples/mesh_decomp/mesh_inputs.h create mode 100644 examples/mesh_decomp/state.h diff --git a/.gitignore b/.gitignore index fbdfa9d3..87400105 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ heffte/ docs_doxygen/ docs_sphinx/ tutorial/getting_started/Example0/build_* -tutorial/getting_started/Example0/install* \ No newline at end of file +tutorial/getting_started/Example0/install* +examples/mesh_decomp/lib/* \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index affcd031..e32ddb2d 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -88,108 +88,111 @@ if (KOKKOS) add_definitions(-DHAVE_THREADS=1) endif() - add_executable(testsetval test_set_values.cpp) - target_link_libraries(testsetval ${LINKING_LIBRARIES}) + # add_executable(testsetval test_set_values.cpp) + # target_link_libraries(testsetval ${LINKING_LIBRARIES}) - add_executable(mtestkokkos main_kokkos.cpp) - target_link_libraries(mtestkokkos ${LINKING_LIBRARIES}) + # add_executable(mtestkokkos main_kokkos.cpp) + # target_link_libraries(mtestkokkos ${LINKING_LIBRARIES}) - add_executable(drrak_test test_drrak.cpp) - target_link_libraries(drrak_test ${LINKING_LIBRARIES}) + # add_executable(drrak_test test_drrak.cpp) + # target_link_libraries(drrak_test ${LINKING_LIBRARIES}) - add_executable(test_kokkos_for kokkos_for.cpp) - target_link_libraries(test_kokkos_for ${LINKING_LIBRARIES}) + # add_executable(test_kokkos_for kokkos_for.cpp) + # target_link_libraries(test_kokkos_for ${LINKING_LIBRARIES}) - add_executable(test_dual_types test_dual_types.cpp) - target_link_libraries(test_dual_types ${LINKING_LIBRARIES}) + # add_executable(test_dual_types test_dual_types.cpp) + # target_link_libraries(test_dual_types ${LINKING_LIBRARIES}) - add_executable(kokkos_csr CSRKokkos.cpp) - target_link_libraries(kokkos_csr ${LINKING_LIBRARIES}) + # add_executable(kokkos_csr CSRKokkos.cpp) + # target_link_libraries(kokkos_csr ${LINKING_LIBRARIES}) - 
add_executable(kokkos_csc CSCKokkos.cpp) - target_link_libraries(kokkos_csc ${LINKING_LIBRARIES}) + # add_executable(kokkos_csc CSCKokkos.cpp) + # target_link_libraries(kokkos_csc ${LINKING_LIBRARIES}) - add_executable(mtr_kokkos-simple mtr-kokkos-simple.cpp) - target_link_libraries(mtr_kokkos-simple ${LINKING_LIBRARIES}) + # add_executable(mtr_kokkos-simple mtr-kokkos-simple.cpp) + # target_link_libraries(mtr_kokkos-simple ${LINKING_LIBRARIES}) - add_executable(annkokkos ann_kokkos.cpp) - target_link_libraries(annkokkos ${LINKING_LIBRARIES}) + # add_executable(annkokkos ann_kokkos.cpp) + # target_link_libraries(annkokkos ${LINKING_LIBRARIES}) - add_executable(annkokkos_compare ann_kokkos_compare.cpp) - target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES}) + # add_executable(annkokkos_compare ann_kokkos_compare.cpp) + # target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES}) - #add_executable(ompperftest ompperftest.cpp) - #target_link_libraries(ompperftest ${LINKING_LIBRARIES}) + # #add_executable(ompperftest ompperftest.cpp) + # #target_link_libraries(ompperftest ${LINKING_LIBRARIES}) - add_executable(lu_test test_lu_solve.cpp) - target_link_libraries(lu_test ${LINKING_LIBRARIES}) + # add_executable(lu_test test_lu_solve.cpp) + # target_link_libraries(lu_test ${LINKING_LIBRARIES}) - add_executable(qr_test test_qr_solve.cpp) - target_link_libraries(qr_test ${LINKING_LIBRARIES}) + # add_executable(qr_test test_qr_solve.cpp) + # target_link_libraries(qr_test ${LINKING_LIBRARIES}) - if (Matar_ENABLE_TRILINOS) - add_executable(anndistributed ann_distributed.cpp) - target_link_libraries(anndistributed ${LINKING_LIBRARIES}) + # if (Matar_ENABLE_TRILINOS) + # add_executable(anndistributed ann_distributed.cpp) + # target_link_libraries(anndistributed ${LINKING_LIBRARIES}) - add_executable(anndistributed_crs ann_distributed_crs.cpp) - target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES}) + # add_executable(anndistributed_crs ann_distributed_crs.cpp) 
+ # target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES}) - add_executable(test_tpetra_farray test_tpetra_farray.cpp) - target_link_libraries(test_tpetra_farray ${LINKING_LIBRARIES}) + # add_executable(test_tpetra_farray test_tpetra_farray.cpp) + # target_link_libraries(test_tpetra_farray ${LINKING_LIBRARIES}) - add_executable(test_tpetra_carray test_tpetra_carray.cpp) - target_link_libraries(test_tpetra_carray ${LINKING_LIBRARIES}) + # add_executable(test_tpetra_carray test_tpetra_carray.cpp) + # target_link_libraries(test_tpetra_carray ${LINKING_LIBRARIES}) - add_executable(test_tpetra_crs test_tpetra_crs.cpp) - target_link_libraries(test_tpetra_crs ${LINKING_LIBRARIES}) + # add_executable(test_tpetra_crs test_tpetra_crs.cpp) + # target_link_libraries(test_tpetra_crs ${LINKING_LIBRARIES}) - add_executable(test_tpetra_mesh test_tpetra_mesh.cpp) - target_link_libraries(test_tpetra_mesh ${LINKING_LIBRARIES}) - endif() + # add_executable(test_tpetra_mesh test_tpetra_mesh.cpp) + # target_link_libraries(test_tpetra_mesh ${LINKING_LIBRARIES}) + # endif() - if (OPENMP) - add_executable(parallel_hello_world parallel_hello_world.cpp) - target_link_libraries(parallel_hello_world ${LINKING_LIBRARIES}) - endif() + # if (OPENMP) + # add_executable(parallel_hello_world parallel_hello_world.cpp) + # target_link_libraries(parallel_hello_world ${LINKING_LIBRARIES}) + # endif() - if (MPI) - include_directories(laplaceMPI) - add_subdirectory(laplaceMPI) - endif() + # if (MPI) + # include_directories(laplaceMPI) + # add_subdirectory(laplaceMPI) + # endif() endif() -### HIP Linking error, will add back in after fixed -if (NOT HIP) - include_directories(virtualFcnKokkos) - add_subdirectory(virtualFcnKokkos) -endif() +# ### HIP Linking error, will add back in after fixed +# if (NOT HIP) +# include_directories(virtualFcnKokkos) +# add_subdirectory(virtualFcnKokkos) +# endif() + +# # In testing, not working +# #include_directories(gArrayofgArrays) +# 
#add_subdirectory(gArrayofgArrays) -# In testing, not working -#include_directories(gArrayofgArrays) -#add_subdirectory(gArrayofgArrays) +# include_directories(virtualFcnMATAR) +# add_subdirectory(virtualFcnMATAR) -include_directories(virtualFcnMATAR) -add_subdirectory(virtualFcnMATAR) +# include_directories(laplace) +# add_subdirectory(laplace) -include_directories(laplace) -add_subdirectory(laplace) +# include_directories(halfspace_cooling) +# add_subdirectory(halfspace_cooling) -include_directories(halfspace_cooling) -add_subdirectory(halfspace_cooling) +# include_directories(watt-graph) +# add_subdirectory(watt-graph) -include_directories(watt-graph) -add_subdirectory(watt-graph) +# #include_directories(matar_fortran) +# #add_subdirectory(matar_fortran) -#include_directories(matar_fortran) -#add_subdirectory(matar_fortran) +# include_directories(sparsetests) +# add_subdirectory(sparsetests) -include_directories(sparsetests) -add_subdirectory(sparsetests) +# include_directories(test_rocm) +# add_subdirectory(test_rocm) -include_directories(test_rocm) -add_subdirectory(test_rocm) +include_directories(mesh_decomp) +add_subdirectory(mesh_decomp) #include_directories(phaseField/srcKokkosVerbose) #add_subdirectory(phaseField/srcKokkosVerbose) diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt index 721859a8..b002a355 100644 --- a/examples/mesh_decomp/CMakeLists.txt +++ b/examples/mesh_decomp/CMakeLists.txt @@ -1,13 +1,32 @@ cmake_minimum_required(VERSION 3.1.3) +# Find MPI +find_package(MPI REQUIRED) + find_package(Matar REQUIRED) +execute_process( + COMMAND ${CMAKE_CURRENT_LIST_DIR}/install_ptscotch.sh + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + RESULT_VARIABLE INSTALL_PTSCOTCH_RESULT +) + +if(NOT INSTALL_PTSCOTCH_RESULT EQUAL 0) + message(FATAL_ERROR "Failed to install PT-Scotch by running install_ptscotch.sh") +endif() + + if (KOKKOS) #find_package(Kokkos REQUIRED) #new - add_executable(mech_decomp mesh_decomp.cpp) + 
add_executable(mesh_decomp mesh_decomp.cpp) add_definitions(-DHAVE_KOKKOS=1) - target_link_libraries(mesh_decomp ${LINKING_LIBRARIES}) + # Add include directories for MPI and Scotch/PT-Scotch + target_include_directories(mesh_decomp PRIVATE ${MPI_CXX_INCLUDE_PATH} ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/src/include) + + # Link libraries + target_link_libraries(mesh_decomp ${LINKING_LIBRARIES} MPI::MPI_CXX) + target_link_directories(mesh_decomp PRIVATE ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/src/lib) endif() diff --git a/examples/mesh_decomp/install_ptscotch.sh b/examples/mesh_decomp/install_ptscotch.sh index 95ad7914..00d29df9 100755 --- a/examples/mesh_decomp/install_ptscotch.sh +++ b/examples/mesh_decomp/install_ptscotch.sh @@ -11,8 +11,10 @@ LIB_DIR="lib" # echo "Installing Scotch and PT-Scotch to ${INSTALL_PREFIX}" -# Create lib directory -mkdir -p "${LIB_DIR}" +# Create lib directory if it doesn't exist +if [ ! -d "${LIB_DIR}" ]; then + mkdir -p "${LIB_DIR}" +fi cd ${LIB_DIR} # Clone and build Scotch echo "Cloning Scotch..." diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index 599cb77d..9a7140a3 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -36,7 +36,6 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "matar.h" #include "state.h" -#include "ref_elem.h" #include #define PI 3.141592653589793 diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index dd26b631..595ab4e0 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -5,28 +5,67 @@ #include #include + +#include "mesh.h" +#include "state.h" +#include "mesh_io.h" + // Include Scotch headers #include "scotch.h" #include "ptscotch.h" -struct initial_mesh_t { - int num_elems; // Number of elements + + +int main(int argc, char** argv) { + + MPI_Init(&argc, &argv); + MATAR_INITIALIZE(argc, argv); + { // MATAR scope + + int world_size; + int rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + + + // Create mesh, gauss points, and node data structures on each rank + Mesh_t mesh; + GaussPoint_t GaussPoints; + node_t node; + + + if (rank == 0) { + std::cout<<"Rank "< nodes_in_elem; // Nodes in an element - std::vector elems_in_elem; // Elements in an element + double origin[3] = {0.0, 0.0, 0.0}; + double length[3] = {1.0, 1.0, 1.0}; + int num_elems[3] = {10, 10, 10}; - std::vector verttab; // Start index in edgetab for each element (size num_elems+1) - std::vector edgetab; // Adjacency info: neighboring element indices -}; + std::cout<<"Initializing mesh"< +#include "matar.h" + +namespace mesh_input +{ +// source of the mesh +enum source +{ + none = 0, ///< No source given, should fail + generate = 1, ///< Create the mesh using the mesh builder + file = 2, ///< Read in the mesh from a file +}; + +// type of mesh to generate if source = generate +enum type +{ + Box = 0, // Create the mesh using the mesh builder + Polar = 1, // Create a polar 2D mesh +}; +} // end of namespace + +static std::map mesh_input_source_map +{ + { "generate", mesh_input::generate }, + { "file", mesh_input::file } +}; + +static std::map mesh_input_type_map +{ + { "box", mesh_input::Box }, + { "polar", mesh_input::Polar 
} +}; + +///////////////////////////////////////////////////////////////////////////// +/// +/// \struct mesh_input_t +/// +/// \brief Meshing related input parameters +/// +///////////////////////////////////////////////////////////////////////////// +struct mesh_input_t +{ + int num_dims = 3; ///< Number of dimensions for the mesh + mesh_input::source source = mesh_input::none; ///< Source of mesh, file or generate + std::string file_path = ""; ///< Absolute path of mesh file + mesh_input::type type; ///< Type of mesh to generate if source = generate + + double origin[3] = { 0.0, 0.0, 0.0 }; ///< Mesh origin for generating a mesh + double length[3] = { 0.0, 0.0, 0.0 }; ///< x,y,z length of generated mesh + size_t num_elems[3] = { 1, 1, 1 }; ///< Number of elements along x,y, z for generating a mesh. + + size_t p_order = 1; + + // WARNING, NOT YET PARSED + double inner_radius = 0.0; ///< Inner radius for generating 2D RZ mesh + double outer_radius = 1.0; ///< Outer radius for generating 2D RZ mesh + double starting_angle = 0.0; ///< Starting angle in degrees for 2D RZ mesh + double ending_angle = 90; ///< Ending angle in degrees for 2D RZ mesh + + int num_radial_elems = 10; ///< Number of elements in the radial direction for 2DRZ mesh + int num_angular_elems = 10; ///< Number of elements in the angular direction for 2DRZ mesh + + double scale_x = 1.0; ///< Scales mesh x coordinate dimensions + double scale_y = 1.0; ///< Scales mesh y coordinate dimensions + double scale_z = 1.0; ///< Scales mesh z coordinate dimensions + + DCArrayKokkos object_ids; ///< the object_ids in the vtu full mesh file (from exodus mesh) + +}; // mesh_input_t + +// ---------------------------------- +// valid inputs for mesh options +// ---------------------------------- +static std::vector str_mesh_inps +{ + "num_dims", + "source", + "file_path", + "type", + "origin", + "length", + "num_elems", + "polynomial_order", + "inner_radius", + "outer_radius", + "starting_angle", + "ending_angle", + 
"num_radial_elems", + "num_angular_elems", + "scale_x", + "scale_y", + "scale_z" +}; + +// ---------------------------------- +// required inputs for mesh options +// ---------------------------------- +static std::vector mesh_required_inps +{ + "source", + "num_dims", +}; + +#endif // end Header Guard \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 03fee676..0c82ba9d 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -1,139 +1,21 @@ -/********************************************************************************************** -© 2020. Triad National Security, LLC. All rights reserved. -This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos -National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. -Department of Energy/National Nuclear Security Administration. All rights in the program are -reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear -Security Administration. The Government is granted for itself and others acting on its behalf a -nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare -derivative works, distribute copies to the public, perform publicly and display publicly, and -to permit others to do so. -This program is open source under the BSD-3 License. -Redistribution and use in source and binary forms, with or without modification, are permitted -provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright notice, this list of -conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, this list of -conditions and the following disclaimer in the documentation and/or other materials -provided with the distribution. -3. 
Neither the name of the copyright holder nor the names of its contributors may be used -to endorse or promote products derived from this software without specific prior -written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************************************/ -#ifndef FIERRO_IO_H -#define FIERRO_IO_H +#ifndef MESH_IO_H +#define MESH_IO_H #include "matar.h" #include "mesh.h" #include "state.h" -#include "simulation_parameters.h" -#include "region.h" -#include "string_utils.h" -#include -#include -#include -#include -#include -#include // for string pattern recoginition -#include -#include -#include -#include +using namespace mtr; -///////////////////////////////////////////////////////////////////////////// -/// -/// \fn get_id -/// -/// \brief This gives the index value of the point or the elem -/// -/// Assumes that the grid has an i,j,k structure -/// the elem = i + (j)*(num_points_i-1) + (k)*(num_points_i-1)*(num_points_j-1) -/// the point = i + (j)*num_points_i + (k)*num_points_i*num_points_j -/// -/// \param i index -/// \param j index -/// \param k index -/// \param Number of i indices -/// \param Number of j indices -/// 
-///////////////////////////////////////////////////////////////////////////// -inline int get_id(int i, int j, int k, int num_i, int num_j) -{ - return i + j * num_i + k * num_i * num_j; -} - -///////////////////////////////////////////////////////////////////////////// -/// -/// \fn PointIndexFromIJK -/// -/// \brief Given (i,j,k) coordinates within the Lagrange hex, return an -/// offset into the local connectivity (PointIds) array. The order parameter -/// must point to an array of 3 integers specifying the order along each -/// axis of the hexahedron. -/// -///////////////////////////////////////////////////////////////////////////// -inline int PointIndexFromIJK(int i, int j, int k, const int* order) -{ - bool ibdy = (i == 0 || i == order[0]); - bool jbdy = (j == 0 || j == order[1]); - bool kbdy = (k == 0 || k == order[2]); - // How many boundaries do we lie on at once? - int nbdy = (ibdy ? 1 : 0) + (jbdy ? 1 : 0) + (kbdy ? 1 : 0); - if (nbdy == 3) { // Vertex DOF - // ijk is a corner node. Return the proper index (somewhere in [0,7]): - return (i ? (j ? 2 : 1) : (j ? 3 : 0)) + (k ? 4 : 0); - } - int offset = 8; - if (nbdy == 2) { // Edge DOF - if (!ibdy) { // On i axis - return (i - 1) + (j ? order[0] - 1 + order[1] - 1 : 0) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; - } - if (!jbdy) { // On j axis - return (j - 1) + (i ? order[0] - 1 : 2 * (order[0] - 1) + order[1] - 1) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; - } - // !kbdy, On k axis - offset += 4 * (order[0] - 1) + 4 * (order[1] - 1); - return (k - 1) + (order[2] - 1) * (i ? (j ? 3 : 1) : (j ? 2 : 0)) + offset; - } - offset += 4 * (order[0] - 1 + order[1] - 1 + order[2] - 1); - if (nbdy == 1) { // Face DOF - if (ibdy) { // On i-normal face - return (j - 1) + ((order[1] - 1) * (k - 1)) + (i ? (order[1] - 1) * (order[2] - 1) : 0) + offset; - } - offset += 2 * (order[1] - 1) * (order[2] - 1); - if (jbdy) { // On j-normal face - return (i - 1) + ((order[0] - 1) * (k - 1)) + (j ? 
(order[2] - 1) * (order[0] - 1) : 0) + offset; - } - offset += 2 * (order[2] - 1) * (order[0] - 1); - // kbdy, On k-normal face - return (i - 1) + ((order[0] - 1) * (j - 1)) + (k ? (order[0] - 1) * (order[1] - 1) : 0) + offset; - } - - // nbdy == 0: Body DOF - offset += 2 * ( (order[1] - 1) * (order[2] - 1) + (order[2] - 1) * (order[0] - 1) + (order[0] - 1) * (order[1] - 1)); - return offset + (i - 1) + (order[0] - 1) * ( (j - 1) + (order[1] - 1) * ( (k - 1))); -} ///////////////////////////////////////////////////////////////////////////// /// -/// \fn get_id_device +/// \fn get_id /// /// \brief This gives the index value of the point or the elem /// @@ -148,4747 +30,137 @@ inline int PointIndexFromIJK(int i, int j, int k, const int* order) /// \param Number of j indices /// ///////////////////////////////////////////////////////////////////////////// -KOKKOS_INLINE_FUNCTION -int get_id_device(int i, int j, int k, int num_i, int num_j) +inline int get_id(int i, int j, int k, int num_i, int num_j) { return i + j * num_i + k * num_i * num_j; } - -//------- -// word is the field name e.g., Offsets, connectivity, etc. 
-// stop is the phrase to stop extracting values -template -inline bool extract_values_xml(T *values_xml, - const std::string& word, - const std::string& stop, - std::ifstream& in, - size_t& size) -{ - - bool found = false; - - std::string line; - - size_t i = 0; - - // Read the file line by line looking for specified word - while (std::getline(in, line)) { - - if (line.find(word) != std::string::npos) { // Check if the portion of the word is in the line - found = true; - } - if(found) { - - // loop over the lines in the file, extracting the values of the field corresponding to the word - while (std::getline(in, line)){ - - std::istringstream iss(line); // Create a stream from the line - - // extract the individual values from the stream - T value; - while (iss >> value) { - values_xml[i] = value; - i++; - } // end while - - if (line.find(stop) != std::string::npos) { // Check if the stop word is in the line - break; - } // end if - - } // end while - - if(found) break; - - } // end if found - - } // end while - - size = i; - - return found; - -} // end function - - -// find the number of points and number of cells in the mesh -inline bool extract_num_points_and_cells_xml(int& numberOfPoints, - int& numberOfCells, - std::ifstream& in) -{ - bool found = false; - - std::string line; - - - // Read the file line by line looking for NumberOfPoints - while (std::getline(in, line)) { - - std::string word = "NumberOfPoints="; // A portion of a word - - if (line.find(word) != std::string::npos) { // Check if the portion of the word is in the line - found = true; - } - if(found) { - // Define regex pattern to match the attributes and capture values - std::regex pattern(R"(NumberOfPoints=\"(\d+)\" NumberOfCells=\"(\d+)\")"); - std::smatch match; - - if (std::regex_search(line, match, pattern)) { - //std::cout << "Number of nodes in mesh file: " << match[1] << std::endl; - //std::cout << "Number of cells in mesh file: " << match[2] << std::endl; - - numberOfPoints = 
std::stoi(match[1].str()); - numberOfCells = std::stoi(match[2].str()); - - } else { - std::cout << "Error reading the number of points and cells in the mesh!" << std::endl; - } - - break; - } // end if - - } // end while - - return found; - -} // end function - - -// 8 = pixal i,j,k linear quad ording -// 9 = linear quad ensight ordering -// 11 = voxel i,j,k linear hex ording -// 12 = linear ensight hex ordering -// 72 = VTK_LAGRANGE_HEXAHEDRON -namespace element_types -{ - enum element_name - { - linear_quad_ijk = 8, - linear_quad = 9, - linear_hex_ijk = 11, - linear_hex = 12, - arbitrary_hex = 72 - }; -} - ///////////////////////////////////////////////////////////////////////////// /// -/// \class MeshReader +/// \fn build_3d_box /// -/// \brief Class for simplifying reading meshes +/// \brief Builds an unstructured 3D rectilinear mesh /// -/// This class contains the requisite functions required to read different -/// mesh formats. The idea is to set the mesh file name, and parse the -/// extension to decide which reader to use. Currently, only ensight .geo -/// files are supported. 
+/// \param Simulation mesh that is built +/// \param Element state data +/// \param Node state data +/// \param origin The origin of the mesh +/// \param length The length of the mesh +/// \param num_elems The number of elements in the mesh /// ///////////////////////////////////////////////////////////////////////////// -class MeshReader +void build_3d_box( + Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + double origin[3], + double length[3], + int num_elems_dim[3]) { -private: - // Handy structs for parsing input meshes - struct Node { - int id; - double x, y, z; - }; - - struct Element { - int id; - std::vector connectivity; - }; - -public: - - char* mesh_file_ = NULL; - - MeshReader() {} // Simulation_Parameters& _simparam); - - ~MeshReader() = default; - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn set_mesh_file - /// - /// \brief Sets the mesh file path for reading in a mesh - /// - /// \param Path to mesh file - /// - ///////////////////////////////////////////////////////////////////////////// - void set_mesh_file(char* MESH) - { - mesh_file_ = MESH; - } - - // Reads and initializes the mesh and geometric state entities - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn read_mesh - /// - /// \brief Read mesh from file - /// - /// \param Simulation mesh - /// \param Simulation state - /// \param Number of dimensions - /// - /// - ///////////////////////////////////////////////////////////////////////////// - void read_mesh(Mesh_t& mesh, - State_t& State, - mesh_input_t& mesh_inps, - int num_dims) - { - if (mesh_file_ == NULL) { - throw std::runtime_error("**** No mesh path given for read_mesh ****"); - } + printf("Creating a 3D box mesh \n"); - std::ifstream file(mesh_file_); - if (file.is_open()) { - std::cout << "The file exists." 
<< std::endl; - file.close(); - } else { - throw std::runtime_error("**** Mesh path given does not exists ****"); - } + const int num_dim = 3; - // Check mesh file extension - // and read based on extension - std::string filePathStr(mesh_file_); - std::string extension; + // Note: In fierro, these come from the simulation parameters + const double lx = length[0]; + const double ly = length[1]; + const double lz = length[2]; - size_t pos = filePathStr.rfind('.'); - if (pos != std::string::npos) { - extension = filePathStr.substr(pos + 1); - } else { - extension = ""; - } + // Note: In fierro, these come from the simulation parameters + const int num_elems_i = num_elems_dim[0]; + const int num_elems_j = num_elems_dim[1]; + const int num_elems_k = num_elems_dim[2]; - std::cout << "File extension is: " << extension << std::endl; + const int num_points_i = num_elems_i + 1; // num points in x + const int num_points_j = num_elems_j + 1; // num points in y + const int num_points_k = num_elems_k + 1; // num points in z - if(extension == "geo"){ // Ensight meshfile extension - read_ensight_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); - } - else if(extension == "inp"){ // Abaqus meshfile extension - read_Abaqus_mesh(mesh, State, num_dims); - } - else if(extension == "vtk"){ // vtk file format - read_vtk_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); - } - else if(extension == "vtu"){ // vtu file format - read_vtu_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); - } - else{ - throw std::runtime_error("**** Mesh file extension not understood ****"); - } + const int num_nodes = num_points_i * num_points_j * num_points_k; - } + const double dx = lx / ((double)num_elems_i); // len/(num_elems_i) + const double dy = ly / ((double)num_elems_j); // len/(num_elems_j) + const double dz = lz / ((double)num_elems_k); // len/(num_elems_k) - 
///////////////////////////////////////////////////////////////////////////// - /// - /// \fn read_ensight_mesh - /// - /// \brief Read .geo mesh file - /// - /// \param Simulation mesh - /// \param Element state struct - /// \param Node state struct - /// \param Corner state struct - /// \param Number of dimensions - /// - ///////////////////////////////////////////////////////////////////////////// - void read_ensight_mesh(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - mesh_input_t& mesh_inps, - int num_dims) - { - FILE* in; - char ch; + const int num_elems = num_elems_i * num_elems_j * num_elems_k; - size_t num_nodes_in_elem = 1; - for (int dim = 0; dim < num_dims; dim++) { - num_nodes_in_elem *= 2; - } + // --- 3D parameters --- + // const int num_faces_in_elem = 6; // number of faces in elem + // const int num_points_in_elem = 8; // number of points in elem + // const int num_points_in_face = 4; // number of points in a face + // const int num_edges_in_elem = 12; // number of edges in a elem - // read the mesh WARNING: assumes a .geo file - in = fopen(mesh_file_, "r"); - - // skip 8 lines - for (int j = 1; j <= 8; j++) { - int i = 0; - while ((ch = (char)fgetc(in)) != '\n') { - i++; - } - } - - // --- Read in the nodes in the mesh --- - - size_t num_nodes = 0; - - fscanf(in, "%lu", &num_nodes); - printf("Number of nodes read in %lu\n", num_nodes); - - - mesh.initialize_nodes(num_nodes); + // initialize mesh node variables + mesh.initialize_nodes(num_nodes); // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers - std::vector required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dims, required_node_state); - - // read the initial mesh coordinates - // x-coords - for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { - fscanf(in, "%le", &node.coords.host(node_id, 0)); - node.coords.host(node_id, 0)*= mesh_inps.scale_x; - } - - // 
y-coords - for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { - fscanf(in, "%le", &node.coords.host(node_id, 1)); - node.coords.host(node_id, 1)*= mesh_inps.scale_y; - } - - // z-coords - for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { - if (num_dims == 3) { - fscanf(in, "%le", &node.coords.host(node_id, 2)); - node.coords.host(node_id, 2)*= mesh_inps.scale_z; - } - else{ - double dummy; - fscanf(in, "%le", &dummy); - } - } // end for - - - // Update device nodal positions - node.coords.update_device(); - - ch = (char)fgetc(in); - - // skip 1 line - for (int j = 1; j <= 1; j++) { - int i = 0; - while ((ch = (char)fgetc(in)) != '\n') { - i++; - } - } - - // --- read in the elements in the mesh --- - size_t num_elem = 0; - - fscanf(in, "%lu", &num_elem); - printf("Number of elements read in %lu\n", num_elem); - - // initialize elem variables - mesh.initialize_elems(num_elem, num_dims); - // GaussPoints.initialize(num_elem, 3); // always 3D here, even for 2D - - - // for each cell read the list of associated nodes - for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { - for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { - fscanf(in, "%lu", &mesh.nodes_in_elem.host(elem_gid, node_lid)); // %d vs zu - - // shift to start node index space at 0 - mesh.nodes_in_elem.host(elem_gid, node_lid) -= 1; - } - } - - // Convert from ensight to IJK mesh - int convert_ensight_to_ijk[8]; - convert_ensight_to_ijk[0] = 0; - convert_ensight_to_ijk[1] = 1; - convert_ensight_to_ijk[2] = 3; - convert_ensight_to_ijk[3] = 2; - convert_ensight_to_ijk[4] = 4; - convert_ensight_to_ijk[5] = 5; - convert_ensight_to_ijk[6] = 7; - convert_ensight_to_ijk[7] = 6; - - int tmp_ijk_indx[8]; - - for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { - for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { - tmp_ijk_indx[node_lid] = mesh.nodes_in_elem.host(elem_gid, convert_ensight_to_ijk[node_lid]); - } - - for (int node_lid = 0; node_lid < 
num_nodes_in_elem; node_lid++){ - mesh.nodes_in_elem.host(elem_gid, node_lid) = tmp_ijk_indx[node_lid]; - } - } - // update device side - mesh.nodes_in_elem.update_device(); - - // initialize corner variables - int num_corners = num_elem * mesh.num_nodes_in_elem; - mesh.initialize_corners(num_corners); - // corner.initialize(num_corners, num_dims); - - // Close mesh input file - fclose(in); - - // Build connectivity - mesh.build_connectivity(); - - return; - } // end read ensight mesh - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn read_Abaqus_mesh - /// - /// \brief Read .inp mesh file - /// - /// \param Simulation mesh - /// \param Simulation state - /// \param Node state struct - /// \param Number of dimensions - /// - ///////////////////////////////////////////////////////////////////////////// - void read_Abaqus_mesh(Mesh_t& mesh, - State_t& State, - int num_dims) - { - - std::cout<<"Reading abaqus input file for mesh"< nodes; - std::vector elements; - - std::string line; - bool readingNodes = false; - bool readingElements = false; - - while (std::getline(inputFile, line)) { - if (line.find("*Node") != std::string::npos) { - readingNodes = true; - std::cout<<"Found *Node"<> node.id && std::getline(iss, token, ',') && iss >> node.x && - std::getline(iss, token, ',') && iss >> node.y && - std::getline(iss, token, ',') && iss >> node.z)) { - std::cerr << "Failed to parse line: " << line << std::endl; - continue; // Skip this line if parsing failed - } - nodes.push_back(node); - } - - if (line.find("*Element") != std::string::npos) { - readingElements = true; - std::cout<<"Found *Element*"<> element.id)){ - std::cout << "Failed to parse line: " << line << std::endl; - continue; // Skip this line if parsing failed - } - - while ((std::getline(iss, token, ','))) { - // Now extract the integer, ignoring any trailing whitespace - int val; - iss >> val; - element.connectivity.push_back(val); - } - - // Convert from 
abaqus to IJK mesh - int convert_abq_to_ijk[8]; - convert_abq_to_ijk[0] = 0; - convert_abq_to_ijk[1] = 1; - convert_abq_to_ijk[2] = 3; - convert_abq_to_ijk[3] = 2; - convert_abq_to_ijk[4] = 4; - convert_abq_to_ijk[5] = 5; - convert_abq_to_ijk[6] = 7; - convert_abq_to_ijk[7] = 6; - - int tmp_ijk_indx[8]; - - for (int node_lid = 0; node_lid < 8; node_lid++) { - tmp_ijk_indx[node_lid] = element.connectivity[convert_abq_to_ijk[node_lid]]; - } - - for (int node_lid = 0; node_lid < 8; node_lid++){ - element.connectivity[node_lid] = tmp_ijk_indx[node_lid]; - } - - elements.push_back(element); - } - } - - inputFile.close(); - - size_t num_nodes = nodes.size(); - - printf("Number of nodes read in %lu\n", num_nodes); - - // initialize node variables - mesh.initialize_nodes(num_nodes); - - // initialize node state, for now, we just need coordinates, the rest will be initialize by the respective solvers - std::vector required_node_state = { node_state::coords }; - - State.node.initialize(num_nodes, num_dims, required_node_state); - - - // Copy nodes to mesh - for(int node_gid = 0; node_gid < num_nodes; node_gid++){ - State.node.coords.host(node_gid, 0) = nodes[node_gid].x; - State.node.coords.host(node_gid, 1) = nodes[node_gid].y; - State.node.coords.host(node_gid, 2) = nodes[node_gid].z; - } - - // Update device nodal positions - State.node.coords.update_device(); - - - // --- read in the elements in the mesh --- - size_t num_elem = elements.size(); - printf("Number of elements read in %lu\n", num_elem); - - // initialize elem variables - mesh.initialize_elems(num_elem, num_dims); - - - // for each cell read the list of associated nodes - for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { - for (int node_lid = 0; node_lid < 8; node_lid++) { - mesh.nodes_in_elem.host(elem_gid, node_lid) = elements[elem_gid].connectivity[node_lid]; - - // shift to start node index space at 0 - mesh.nodes_in_elem.host(elem_gid, node_lid) -= 1; - } - } + std::vector required_node_state = { 
node_state::coords }; + node.initialize(num_nodes, num_dim, required_node_state); - // update device side - mesh.nodes_in_elem.update_device(); + // --- Build nodes --- - // initialize corner variables - int num_corners = num_elem * mesh.num_nodes_in_elem; - mesh.initialize_corners(num_corners); - // State.corner.initialize(num_corners, num_dims); - - // Build connectivity - mesh.build_connectivity(); - } // end read abaqus mesh - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn read_vtk_mesh - /// - /// \brief Read ASCII .vtk mesh file - /// - /// \param Simulation mesh - /// \param Simulation state - /// \param Node state struct - /// \param Number of dimensions - /// - ///////////////////////////////////////////////////////////////////////////// - void read_vtk_mesh(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - mesh_input_t& mesh_inps, - int num_dims) - { - - std::cout<<"Reading VTK mesh"< v = split (str, delimiter); - - // looking for the following text: - // POINTS %d float - if(v[0] == "POINTS"){ - size_t num_nodes = std::stoi(v[1]); - printf("Number of nodes read in %zu\n", num_nodes); - mesh.initialize_nodes(num_nodes); - - std::vector required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dims, required_node_state); - - found=true; - } // end if - - - if (i>1000){ - std::cerr << "ERROR: Failed to find POINTS in file" << std::endl; - break; - } // end if - - i++; - } // end while - - // read the node coordinates - for (node_gid=0; node_gid v = split (str, delimiter); - - // save the nodal coordinates - node.coords.host(node_gid, 0) = mesh_inps.scale_x*std::stod(v[0]); // double - node.coords.host(node_gid, 1) = mesh_inps.scale_y*std::stod(v[1]); // double - if(num_dims==3){ - node.coords.host(node_gid, 2) = mesh_inps.scale_z*std::stod(v[2]); // double - } - - } // end for nodes - - - // Update device nodal positions - node.coords.update_device(); - - 
- found=false; - - // look for CELLS - i = 0; - size_t num_elem = 0; - while (found==false) { - std::string str; - std::getline(in, str); - - std::string delimiter = " "; - std::vector v = split (str, delimiter); - std::cout << v[0] << std::endl; // printing - - // looking for the following text: - // CELLS num_elem size - if(v[0] == "CELLS"){ - num_elem = std::stoi(v[1]); - printf("Number of elements read in %zu\n", num_elem); - - // initialize elem variables - mesh.initialize_elems(num_elem, num_dims); - - found=true; - } // end if - - - if (i>1000){ - printf("ERROR: Failed to find CELLS \n"); - break; - } // end if - - i++; - } // end while - - - // read the node ids in the element - for (elem_gid=0; elem_gid v = split (str, delimiter); - num_nodes_in_elem = std::stoi(v[0]); - - for (size_t node_lid=0; node_lid v = split (str, delimiter); - - // looking for the following text: - // CELLS num_elem size - if(v[0] == "CELL_TYPES"){ - - std::getline(in, str); - elem_type = std::stoi(str); - - found=true; - } // end if - - - if (i>1000){ - printf("ERROR: Failed to find elem_TYPE \n"); - break; - } // end if - - i++; - } // end while - printf("Element type = %zu \n", elem_type); - // elem types: - // linear hex = 12, linear quad = 9 - found=false; - - - if(num_nodes_in_elem==8 & elem_type != 12) { - printf("Wrong element type of %zu \n", elem_type); - std::cerr << "ERROR: incorrect element type in VTK file" << std::endl; - } - - in.close(); - - } // end of VTKread function - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn read_vtu_mesh - /// - /// \brief Read ASCII .vtu mesh file - /// - /// \param Simulation mesh - /// \param Simulation state - /// \param Node state struct - /// \param Number of dimensions - /// - ///////////////////////////////////////////////////////////////////////////// - void read_vtu_mesh(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - mesh_input_t& mesh_inps, - 
int num_dims) - { - - std::cout<<"Reading VTU file in a multiblock VTK mesh"< required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dims, required_node_state); - - //------------------------------------ - // allocate the elem object id array - mesh_inps.object_ids = DCArrayKokkos (num_elems, "ObjectIDs"); - - - // ------------------------ - // Mesh file storage order: - // objectId - // Points - // connectivity - // offsets - // types - // ------------------------ - - // temporary arrays - DCArrayKokkos node_coords(num_nodes,3, "node_coords_vtu_file"); // always 3 with vtu files - DCArrayKokkos connectivity(num_elems,num_nodes_in_elem, "connectivity_vtu_file"); - DCArrayKokkos elem_types(num_elems, "elem_types_vtu_file"); // element types - - - // for all fields, we stop recording when we get to "<" - std::string stop = "<"; - - // the size of 1D storage from reading the mesh file - size_t size; - - // --- - // Object ids - // --- - - // the object id in the element - // array dims are (num_elems) - found = extract_values_xml(mesh_inps.object_ids.host.pointer(), - "\"ObjectId\"", - stop, - in, - size); - if(found==false){ - throw std::runtime_error("ERROR: ObjectIDs were not found in the XML file!"); - //std::cout << "ERROR: ObjectIDs were not found in the XML file!" << std::endl; - } - mesh_inps.object_ids.update_device(); - - - // --- - // Nodal coordinates of mesh - // --- - - // coordinates of the node - // array dims are (num_nodes,dims) - // must use the quotes around Points to read the point values - found = extract_values_xml(node_coords.host.pointer(), - "\"Points\"", - stop, - in, - size); - if(found==false){ - throw std::runtime_error("**** ERROR: mesh nodes were not found in the XML file! ****"); - //std::cout << "ERROR: mesh nodes were not found in the XML file!" 
<< std::endl; - } - if (size!=num_nodes*3){ - throw std::runtime_error("ERROR: failed to read all the mesh nodes!"); - //std::cout << "ERROR: failed to read all the mesh nodes!" << std::endl; - } - node_coords.update_device(); - - // dimensional scaling of the mesh - const double scl_x = mesh_inps.scale_x; - const double scl_y = mesh_inps.scale_y; - const double scl_z = mesh_inps.scale_z; - - // save the node coordinates to the state array - FOR_ALL(node_gid, 0, mesh.num_nodes, { - - // save the nodal coordinates - node.coords(node_gid, 0) = scl_x*node_coords(node_gid, 0); // double - node.coords(node_gid, 1) = scl_y*node_coords(node_gid, 1); // double - if(num_dims==3){ - node.coords(node_gid, 2) = scl_z*node_coords(node_gid, 2); // double - } - - }); // end for parallel nodes - node.coords.update_host(); - - - // --- - // Nodes in the element - // --- - - // fill temporary nodes in the element array - // array dims are (num_elems,num_nodes_in_elem) - found = extract_values_xml(connectivity.host.pointer(), - "\"connectivity\"", - stop, - in, - size); - if(found==false){ - std::cout << "ERROR: mesh connectivity was not found in the XML file!" << std::endl; - } - connectivity.update_device(); - - // array dims are the (num_elems) - // 8 = pixal i,j,k linear quad format - // 9 = linear quad ensight ordering - // 12 = linear ensight hex ordering - // 72 = VTK_LAGRANGE_HEXAHEDRON - // .... - found = extract_values_xml(elem_types.host.pointer(), - "\"types\"", - stop, - in, - size); - if(found==false){ - std::cout << "ERROR: element types were not found in the XML file!" 
<< std::endl; - } - elem_types.update_device(); - - // check that the element type is supported by Fierro - FOR_ALL (elem_gid, 0, mesh.num_elems, { - if(elem_types(elem_gid) == element_types::linear_quad || - elem_types(elem_gid) == element_types::linear_hex_ijk || - elem_types(elem_gid) == element_types::linear_hex || - elem_types(elem_gid) == element_types::arbitrary_hex ) - { - // at least one of them is true - } - else - { - // unknown element used - Kokkos::abort("Unknown element type in the mesh \n"); - } - }); - - // Convert from ensight linear hex to a IJK mesh - CArrayKokkos convert_ensight_to_ijk(8, "convert_ensight_to_ijk"); - - // Convert the arbitrary order hex to a IJK mesh - DCArrayKokkos convert_pn_vtk_to_ijk(mesh.num_nodes_in_elem, "convert_pn_vtk_to_ijk"); - - //build the connectivity for element type 12 - // elem_types.host(0) - switch(elem_types.host(0)){ - - case element_types::linear_quad: - // the node order is correct, no changes required - - FOR_ALL (elem_gid, 0, mesh.num_elems, { - - for (size_t node_lid=0; node_lid origin(num_dim); - // SimulationParamaters.mesh_input.origin.update_host(); - for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } - - // --- 2D parameters --- - // const int num_faces_in_elem = 4; // number of faces in elem - // const int num_points_in_elem = 4; // number of points in elem - // const int num_points_in_face = 2; // number of points in a face - // const int num_edges_in_elem = 4; // number of edges in a elem - - // --- mesh node ordering --- - // Convert ijk index system to the finite element numbering convention - // for vertices in elem - auto convert_point_number_in_quad = CArray(4); - convert_point_number_in_quad(0) = 0; - convert_point_number_in_quad(1) = 1; - convert_point_number_in_quad(2) = 3; - convert_point_number_in_quad(3) = 2; - - // intialize node variables - mesh.initialize_nodes(num_nodes); - - // initialize node state, for now, we just need coordinates, 
the rest will be initialize by the respective solvers - std::vector required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dim, required_node_state); - - // --- Build nodes --- - - // populate the point data structures + // populate the point data structures + for (int k = 0; k < num_points_k; k++) { for (int j = 0; j < num_points_j; j++) { for (int i = 0; i < num_points_i; i++) { // global id for the point - int node_gid = get_id(i, j, 0, num_points_i, num_points_j); + int node_gid = get_id(i, j, k, num_points_i, num_points_j); // store the point coordinates node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; + node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; } // end for i } // end for j + } // end for k + + node.coords.update_device(); - node.coords.update_device(); + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); - // initialize elem variables - mesh.initialize_elems(num_elems, num_dim); + // --- Build elems --- - // populate the elem center data structures + // populate the elem center data structures + for (int k = 0; k < num_elems_k; k++) { for (int j = 0; j < num_elems_j; j++) { for (int i = 0; i < num_elems_i; i++) { // global id for the elem - int elem_gid = get_id(i, j, 0, num_elems_i, num_elems_j); + int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); // store the point IDs for this elem where the range is - // (i:i+1, j:j+1 for a linear quad + // (i:i+1, j:j+1, k:k+1) for a linear hexahedron int this_point = 0; + for (int kcount = k; kcount <= k + 1; kcount++) { + for (int jcount = j; jcount <= j + 1; jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + int node_gid = get_id(icount, jcount, kcount, + num_points_i, num_points_j); - for (int jcount = j; jcount <= j + 1; jcount++) { - for (int icount = i; icount <= i + 1; icount++) { - // global id for the points - int node_gid = 
get_id(icount, jcount, 0, num_points_i, num_points_j); + // convert this_point index to the FE index convention + int this_index = this_point; //convert_point_number_in_Hex(this_point); - // convert this_point index to the FE index convention - int this_index = convert_point_number_in_quad(this_point); - - // store the points in this elem according the the finite - // element numbering convention - mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; - // increment the point counting index - this_point = this_point + 1; - } // end for icount - } // end for jcount + // increment the point counting index + this_point = this_point + 1; + } // end for icount + } // end for jcount + } // end for kcount } // end for i } // end for j + } // end for k - // update device side - mesh.nodes_in_elem.update_device(); - - // intialize corner variables - int num_corners = num_elems * mesh.num_nodes_in_elem; - mesh.initialize_corners(num_corners); - // corner.initialize(num_corners, num_dim); - - // Build connectivity - mesh.build_connectivity(); - } // end build_2d_box - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn build_2d_polar - /// - /// \brief Builds an unstructured 2D polar mesh - /// - /// \param Simulation mesh that is built - /// \param Element state data - /// \param Node state data - /// \param Corner state data - /// \param Simulation parameters - /// - ///////////////////////////////////////////////////////////////////////////// - void build_2d_polar(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - SimulationParameters_t& SimulationParamaters) const - { - printf("Creating a 2D polar mesh \n"); - - int num_dim = 2; - - const double inner_radius = SimulationParamaters.mesh_input.inner_radius; - const double outer_radius = 
SimulationParamaters.mesh_input.outer_radius; - - const double start_angle = PI / 180.0 * SimulationParamaters.mesh_input.starting_angle; - const double end_angle = PI / 180.0 * SimulationParamaters.mesh_input.ending_angle; - - const int num_elems_i = SimulationParamaters.mesh_input.num_radial_elems; - const int num_elems_j = SimulationParamaters.mesh_input.num_angular_elems; - - const int num_points_i = num_elems_i + 1; // num points in x - const int num_points_j = num_elems_j + 1; // num points in y - - const int num_nodes = num_points_i * num_points_j; - - const double dx = (outer_radius - inner_radius) / ((double)num_elems_i); // len/(elems) - const double dy = (end_angle - start_angle) / ((double)num_elems_j); // len/(elems) - - const int num_elems = num_elems_i * num_elems_j; - - std::vector origin(num_dim); - - for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } - - // --- 2D parameters --- - // const int num_faces_in_elem = 4; // number of faces in elem - // const int num_points_in_elem = 4; // number of points in elem - // const int num_points_in_face = 2; // number of points in a face - // const int num_edges_in_elem = 4; // number of edges in a elem - - // --- mesh node ordering --- - // Convert ijk index system to the finite element numbering convention - // for vertices in elem - auto convert_point_number_in_quad = CArray(4); - convert_point_number_in_quad(0) = 0; - convert_point_number_in_quad(1) = 1; - convert_point_number_in_quad(2) = 3; - convert_point_number_in_quad(3) = 2; - - // intialize node variables - mesh.initialize_nodes(num_nodes); - - // initialize node state, for now, we just need coordinates, the rest will be initialize by the respective solvers - std::vector required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dim, required_node_state); - - // populate the point data structures - for (int j = 0; j < num_points_j; j++) { - for (int i = 0; i < num_points_i; i++) { - // 
global id for the point - int node_gid = get_id(i, j, 0, num_points_i, num_points_j); - - double r_i = inner_radius + (double)i * dx; - double theta_j = start_angle + (double)j * dy; - - // store the point coordinates - node.coords.host(node_gid, 0) = origin[0] + r_i * cos(theta_j); - node.coords.host(node_gid, 1) = origin[1] + r_i * sin(theta_j); - - if(node.coords.host(node_gid, 0) < 0.0){ - throw std::runtime_error("**** NODE RADIUS FOR RZ MESH MUST BE POSITIVE ****"); - } - - } // end for i - } // end for j - - - node.coords.update_device(); - - // initialize elem variables - mesh.initialize_elems(num_elems, num_dim); - - // populate the elem center data structures - for (int j = 0; j < num_elems_j; j++) { - for (int i = 0; i < num_elems_i; i++) { - // global id for the elem - int elem_gid = get_id(i, j, 0, num_elems_i, num_elems_j); - - // store the point IDs for this elem where the range is - // (i:i+1, j:j+1 for a linear quad - int this_point = 0; - - for (int jcount = j; jcount <= j + 1; jcount++) { - for (int icount = i; icount <= i + 1; icount++) { - // global id for the points - int node_gid = get_id(icount, jcount, 0, num_points_i, num_points_j); - - // convert this_point index to the FE index convention - int this_index = convert_point_number_in_quad(this_point); - - // store the points in this elem according the the finite - // element numbering convention - mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; - - // increment the point counting index - this_point = this_point + 1; - } // end for icount - } // end for jcount - } // end for i - } // end for j - - // update device side - mesh.nodes_in_elem.update_device(); - - // intialize corner variables - int num_corners = num_elems * mesh.num_nodes_in_elem; - mesh.initialize_corners(num_corners); - // corner.initialize(num_corners, num_dim); - - // Build connectivity - mesh.build_connectivity(); - } // end build_2d_box - - 
///////////////////////////////////////////////////////////////////////////// - /// - /// \fn build_3d_box - /// - /// \brief Builds an unstructured 3D rectilinear mesh - /// - /// \param Simulation mesh that is built - /// \param Element state data - /// \param Node state data - /// \param Corner state data - /// \param Simulation parameters - /// - ///////////////////////////////////////////////////////////////////////////// - void build_3d_box(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - SimulationParameters_t& SimulationParamaters) const - { - printf("Creating a 3D box mesh \n"); - - const int num_dim = 3; - - // SimulationParamaters.mesh_input.length.update_host(); - const double lx = SimulationParamaters.mesh_input.length[0]; - const double ly = SimulationParamaters.mesh_input.length[1]; - const double lz = SimulationParamaters.mesh_input.length[2]; - - // SimulationParamaters.mesh_input.num_elems.update_host(); - const int num_elems_i = SimulationParamaters.mesh_input.num_elems[0]; - const int num_elems_j = SimulationParamaters.mesh_input.num_elems[1]; - const int num_elems_k = SimulationParamaters.mesh_input.num_elems[2]; - - const int num_points_i = num_elems_i + 1; // num points in x - const int num_points_j = num_elems_j + 1; // num points in y - const int num_points_k = num_elems_k + 1; // num points in y - - const int num_nodes = num_points_i * num_points_j * num_points_k; - - const double dx = lx / ((double)num_elems_i); // len/(num_elems_i) - const double dy = ly / ((double)num_elems_j); // len/(num_elems_j) - const double dz = lz / ((double)num_elems_k); // len/(num_elems_k) - - const int num_elems = num_elems_i * num_elems_j * num_elems_k; - - std::vector origin(num_dim); - // SimulationParamaters.mesh_input.origin.update_host(); - for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } - - // --- 3D parameters --- - // const int num_faces_in_elem = 6; // number of faces in 
elem - // const int num_points_in_elem = 8; // number of points in elem - // const int num_points_in_face = 4; // number of points in a face - // const int num_edges_in_elem = 12; // number of edges in a elem - - - // initialize mesh node variables - mesh.initialize_nodes(num_nodes); - - // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers - std::vector required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dim, required_node_state); - - // --- Build nodes --- - - // populate the point data structures - for (int k = 0; k < num_points_k; k++) { - for (int j = 0; j < num_points_j; j++) { - for (int i = 0; i < num_points_i; i++) { - // global id for the point - int node_gid = get_id(i, j, k, num_points_i, num_points_j); - - // store the point coordinates - node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; - node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; - node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; - } // end for i - } // end for j - } // end for k - - - node.coords.update_device(); - - // initialize elem variables - mesh.initialize_elems(num_elems, num_dim); - - // --- Build elems --- - - // populate the elem center data structures - for (int k = 0; k < num_elems_k; k++) { - for (int j = 0; j < num_elems_j; j++) { - for (int i = 0; i < num_elems_i; i++) { - // global id for the elem - int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); - - // store the point IDs for this elem where the range is - // (i:i+1, j:j+1, k:k+1) for a linear hexahedron - int this_point = 0; - for (int kcount = k; kcount <= k + 1; kcount++) { - for (int jcount = j; jcount <= j + 1; jcount++) { - for (int icount = i; icount <= i + 1; icount++) { - // global id for the points - int node_gid = get_id(icount, jcount, kcount, - num_points_i, num_points_j); - - // convert this_point index to the FE index convention - int this_index = this_point; 
//convert_point_number_in_Hex(this_point); - - // store the points in this elem according the the finite - // element numbering convention - mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; - - // increment the point counting index - this_point = this_point + 1; - } // end for icount - } // end for jcount - } // end for kcount - } // end for i - } // end for j - } // end for k - - // update device side - mesh.nodes_in_elem.update_device(); - - // initialize corner variables - int num_corners = num_elems * mesh.num_nodes_in_elem; - mesh.initialize_corners(num_corners); - // corner.initialize(num_corners, num_dim); - - // Build connectivity - mesh.build_connectivity(); - } // end build_3d_box - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn build_3d_HexN_box - /// - /// \brief Builds an unstructured high order 3D rectilinear mesh - /// - /// \param Simulation mesh that is built - /// \param Element state data - /// \param Node state data - /// \param Corner state data - /// \param Simulation parameters - /// - ///////////////////////////////////////////////////////////////////////////// - void build_3d_HexN_box(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - SimulationParameters_t& SimulationParamaters) const - { - printf(" ***** WARNING:: build_3d_HexN_box not yet implemented\n"); - const int num_dim = 3; - - // SimulationParamaters.mesh_input.length.update_host(); - const double lx = SimulationParamaters.mesh_input.length[0]; - const double ly = SimulationParamaters.mesh_input.length[1]; - const double lz = SimulationParamaters.mesh_input.length[2]; - - // SimulationParamaters.mesh_input.num_elems.update_host(); - const int num_elems_i = SimulationParamaters.mesh_input.num_elems[0]; - const int num_elems_j = SimulationParamaters.mesh_input.num_elems[1]; - const int num_elems_k = SimulationParamaters.mesh_input.num_elems[2]; - - // creating zones for the Pn order - const int 
Pn_order = SimulationParamaters.mesh_input.p_order; - - if (Pn_order > 19) { - printf("Fierro DG and RD solvers are only valid for elements up to Pn = 19 \n"); - return; - } - - const int num_zones_i = Pn_order*num_elems_i; - const int num_zones_j = Pn_order*num_elems_j; - const int num_zones_k = Pn_order*num_elems_k; - - const int num_points_i = num_zones_i+1; // num points in x accounting for Pn - const int num_points_j = num_zones_j+1; // num points in y accounting for Pn - const int num_points_k = num_zones_k+1; // num points in y accounting for Pn - - - const double dx = lx/((double)num_zones_i); // len/(num_zones_i) - const double dy = ly/((double)num_zones_j); // len/(num_zones_j) - const double dz = lz/((double)num_zones_k); // len/(num_zones_k) - - const int num_elems = num_elems_i*num_elems_j*num_elems_k; - // const int num_zones = num_zones_i*num_zones_j*num_zones_k; // accounts for Pn - - std::vector origin(num_dim); - for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } - - // --- 3D parameters --- - // const int num_faces_in_zone = 6; // number of faces in zone - // const int num_points_in_zone = 8; // number of points in zone - // const int num_points_in_face = 4; // number of points in a face - - // p_order = 1, 2, 3, 4, 5 - // num_nodes = 2, 3, 4, 5, 6 - const int num_1D_points = Pn_order+1; - const int num_points_in_elem = num_1D_points*num_1D_points*num_1D_points; - - - // --- elem --- - auto elem_coords = CArray (num_elems, num_dim); - auto elem_point_list = CArray (num_elems, num_points_in_elem); - - - // --- point --- - int num_points = num_points_i * num_points_j * num_points_k; - auto pt_coords = CArray (num_points, num_dim); - - - // --- Build nodes --- - - // initialize node variables - mesh.initialize_nodes(num_points); - - // - std::vector required_node_state = { node_state::coords }; - node.initialize(num_points, num_dim, required_node_state); - // populate the point data structures - for (int k = 
0; k < num_points_k; k++){ - for (int j = 0; j < num_points_j; j++){ - for (int i = 0; i < num_points_i; i++){ - - - // global id for the point - int node_gid = get_id(i, j, k, num_points_i, num_points_j); - - // store the point coordinates - node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; - node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; - node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; - - } // end for k - } // end for i - } // end for j - - - node.coords.update_device(); - - - // initialize elem variables - mesh.initialize_elems(num_elems, num_dim); - - // --- Build elems --- - - // populate the elem center data structures accounting for Pn - for (int k=0; k graphics_times, - std::vector node_states, - std::vector gauss_pt_states, - std::vector material_pt_states, - const size_t solver_id) - { - - - // node_state is an enum for possible fields (e.g., coords, velocity, etc.), see state.h - // gauss_pt_state is an enum for possible fields (e.g., vol, divergence, etc.) - // material_pt_state is an enum for possible fields (e.g., den, pres, etc.) 
- - - // ******************* - // Update host - // ******************* - - const size_t num_mats = State.MaterialPoints.num_material_points.size(); - - // material point values - - // Update host data for mat_pt state - for (auto field : material_pt_states){ - switch(field){ - // scalar vars to write out - case material_pt_state::density: - State.MaterialPoints.den.update_host(); - break; - case material_pt_state::pressure: - State.MaterialPoints.pres.update_host(); - break; - case material_pt_state::specific_internal_energy: - State.MaterialPoints.sie.update_host(); - break; - case material_pt_state::sound_speed: - State.MaterialPoints.sspd.update_host(); - break; - case material_pt_state::mass: - State.MaterialPoints.mass.update_host(); - break; - case material_pt_state::volume_fraction: - State.MaterialPoints.volfrac.update_host(); - State.MaterialPoints.geo_volfrac.update_host(); - break; - case material_pt_state::eroded_flag: - State.MaterialPoints.eroded.update_host(); - break; - // tensor vars to write out - case material_pt_state::stress: - State.MaterialPoints.stress.update_host(); - break; - - // additional vars for thermal-mechanical solver - case material_pt_state::thermal_conductivity: - State.MaterialPoints.conductivity.update_host(); - break; - - case material_pt_state::specific_heat: - State.MaterialPoints.specific_heat.update_host(); - break; - - // add other variables here - - // not used - case material_pt_state::elastic_modulii: - break; - case material_pt_state::shear_modulii: - break; - case material_pt_state::poisson_ratios: - break; - case material_pt_state::heat_flux: - break; - default: - std::cout<<"Desired material point state not understood in outputs"< elem_scalar_var_names(num_elem_scalar_vars); - std::vector elem_tensor_var_names(num_elem_tensor_vars); - - // Scalar, vector, and tensor values associated with a material in part elems - std::vector mat_elem_scalar_var_names(num_mat_pt_scalar_vars); - std::vector 
mat_elem_tensor_var_names(num_mat_pt_tensor_vars); - - - // the ids to access a variable in the mat_scalar_var_name or tensor list - int mat_den_id = -1; - int mat_pres_id = -1; - int mat_sie_id = -1; - int mat_sspd_id = -1; - int mat_mass_id = -1; - int mat_volfrac_id = -1; - int mat_geo_volfrac_id = -1; // geometric volume fraction of part - int mat_eroded_id = -1; - int mat_stress_id = -1; - - int mat_conductivity_id = -1; - int mat_specific_heat_id = -1; - - // the index for the scalar, vector, and tensor fields - size_t var = 0; - size_t vector_var = 0; - size_t tensor_var = 0; - - // material point state to output - for (auto field : SimulationParamaters.output_options.output_mat_pt_state){ - switch(field){ - // scalar vars - case material_pt_state::density: - mat_elem_scalar_var_names[var] = "mat_den"; - mat_den_id = var; - var++; - break; - case material_pt_state::pressure: - mat_elem_scalar_var_names[var] = "mat_pres"; - mat_pres_id = var; - var++; - break; - case material_pt_state::specific_internal_energy: - mat_elem_scalar_var_names[var] = "mat_sie"; - mat_sie_id = var; - var++; - break; - case material_pt_state::sound_speed: - mat_elem_scalar_var_names[var] = "mat_sspd"; - mat_sspd_id = var; - var++; - break; - case material_pt_state::mass: - mat_elem_scalar_var_names[var] = "mat_mass"; - mat_mass_id = var; - var++; - break; - case material_pt_state::volume_fraction: - mat_elem_scalar_var_names[var] = "mat_volfrac"; - mat_volfrac_id = var; - var++; - - mat_elem_scalar_var_names[var] = "mat_geo_volfrac"; - mat_geo_volfrac_id = var; - var++; - break; - case material_pt_state::eroded_flag: - mat_elem_scalar_var_names[var] = "mat_eroded"; - mat_eroded_id = var; - var++; - break; - // tensor vars - case material_pt_state::stress: - mat_elem_tensor_var_names[tensor_var] = "mat_stress"; - mat_stress_id = tensor_var; - tensor_var++; - break; - - - // additional vars for thermal-mechanical solver - case material_pt_state::thermal_conductivity: - 
mat_elem_scalar_var_names[var] = "mat_thermal_K"; - mat_conductivity_id = var; - var++; - break; - - case material_pt_state::specific_heat: - mat_elem_scalar_var_names[var] = "mat_Cp"; - mat_specific_heat_id = var; - var++; - break; - - - // add other variables here - - // not used - case material_pt_state::elastic_modulii: - break; - case material_pt_state::shear_modulii: - break; - case material_pt_state::poisson_ratios: - break; - case material_pt_state::heat_flux: - break; - } // end switch - } // end for over mat_pt_states - - - // element average fields to output - - // the ids to access a variable in the elem_scalar_var_name or tensor list - int den_id = -1; - int pres_id = -1; - int sie_id = -1; - int sspd_id = -1; - int mass_id = -1; - int stress_id = -1; - - int conductivity_id = -1; - int specific_heat_id = -1; - - // reset the counters - var = 0; - vector_var = 0; - tensor_var = 0; - - // element state to output - for (auto field : SimulationParamaters.output_options.output_elem_state){ - switch(field){ - // scalar vars - case material_pt_state::density: - elem_scalar_var_names[var] = "den"; - den_id = var; - var++; - break; - case material_pt_state::pressure: - elem_scalar_var_names[var] = "pres"; - pres_id = var; - var++; - break; - case material_pt_state::specific_internal_energy: - elem_scalar_var_names[var] = "sie"; - sie_id = var; - var++; - break; - case material_pt_state::sound_speed: - elem_scalar_var_names[var] = "sspd"; - sspd_id = var; - var++; - break; - case material_pt_state::mass: - elem_scalar_var_names[var] = "mass"; - mass_id = var; - var++; - break; - // tensor vars - case material_pt_state::stress: - elem_tensor_var_names[tensor_var] = "stress"; - stress_id = tensor_var; - tensor_var++; - break; - - // heat transfer variables - case material_pt_state::thermal_conductivity: - elem_scalar_var_names[var] = "thermal_K"; - conductivity_id = var; - var++; - break; - - case material_pt_state::specific_heat: - elem_scalar_var_names[var] = 
"Cp"; - specific_heat_id = var; - var++; - break; - - // add other variables here - - // not used - case material_pt_state::volume_fraction: - break; - case material_pt_state::eroded_flag: - break; - case material_pt_state::elastic_modulii: - break; - case material_pt_state::shear_modulii: - break; - case material_pt_state::poisson_ratios: - break; - case material_pt_state::heat_flux: - break; - } // end switch - } // end for over mat_pt_states - - // append Gauss point vars to the element arrays - int vol_id = -1; - int div_id = -1; - int level_set_id = -1; - int vel_grad_id = -1; - - - for (auto field : SimulationParamaters.output_options.output_gauss_pt_state){ - switch(field){ - // scalars - case gauss_pt_state::volume: - elem_scalar_var_names[var] = "vol"; - vol_id = var; - var++; - break; - case gauss_pt_state::divergence_velocity: - elem_scalar_var_names[var] = "div"; - div_id = var; - var++; - break; - - case gauss_pt_state::level_set: - elem_scalar_var_names[var] = "level_set"; - level_set_id = var; - var++; - break; - - // tensors - case gauss_pt_state::gradient_velocity: - elem_tensor_var_names[tensor_var] = "vel_grad"; - vel_grad_id = tensor_var; - tensor_var++; - break; - } // end switch - } // end loop over gauss_pt_states - - - // ******************* - // nodal values - // ******************* - - size_t num_node_scalar_vars = 0; - size_t num_node_vector_vars = 0; - - for (auto field : SimulationParamaters.output_options.output_node_state){ - switch(field){ - // --- scalars - case node_state::mass: - num_node_scalar_vars ++; - break; - case node_state::temp: - num_node_scalar_vars ++; - break; - // -- vectors - case node_state::coords: - num_node_vector_vars ++; - break; - case node_state::velocity: - num_node_vector_vars ++; // for velocity - num_node_vector_vars ++; // for acceleration - break; - case node_state::gradient_level_set: - num_node_vector_vars ++; - break; - case node_state::force: - break; - - // heat transer vars - case 
node_state::heat_transfer: - break; - } // end switch - } // end for over - Kokkos::fence(); - - - // Scalar and vector values associated with a node - std::vector node_scalar_var_names(num_node_scalar_vars); - std::vector node_vector_var_names(num_node_vector_vars); - - int node_mass_id = -1; - int node_vel_id = -1; - int node_accel_id = -1; - int node_coord_id = -1; - int node_temp_id = -1; - int node_grad_level_set_id = -1; - - // reset counters for node fields - var = 0; - vector_var = 0; - tensor_var = 0; - - for (auto field : SimulationParamaters.output_options.output_node_state){ - switch(field){ - // scalars - case node_state::mass: - node_scalar_var_names[var] = "node_mass"; - node_mass_id = var; - var++; - break; - case node_state::temp: - node_scalar_var_names[var] = "node_temp"; - node_temp_id = var; - var++; - break; - - // vector fields - - case node_state::coords: - node_vector_var_names[vector_var] = "node_coords"; - node_coord_id = vector_var; - vector_var++; - break; - - case node_state::velocity: - node_vector_var_names[vector_var] = "node_vel"; - node_vel_id = vector_var; - vector_var++; - - node_vector_var_names[vector_var] = "node_accel"; - node_accel_id = vector_var; - vector_var++; - break; - - case node_state::gradient_level_set: - node_vector_var_names[vector_var] = "node_grad_lvlset"; - node_grad_level_set_id = vector_var; - vector_var++; - break; - - // -- not used vars - case node_state::force: - break; - - // heat transer vars - case node_state::heat_transfer: - break; - - // tensors - - } // end switch - } // end for over - - - // ************************************** - // build and save element average fields - // ************************************** - - // short hand - const size_t num_nodes = mesh.num_nodes; - const size_t num_elems = mesh.num_elems; - const size_t num_dims = mesh.num_dims; - const size_t num_nodes_in_elem = mesh.num_nodes_in_elem; - const int Pn_order = mesh.Pn; - - // save the elem state to an array for 
exporting to graphics files - DCArrayKokkos elem_scalar_fields(num_elem_scalar_vars, num_elems, "elem_scalars"); - DCArrayKokkos elem_tensor_fields(num_elem_tensor_vars, num_elems, 3, 3, "elem_tensors"); - elem_scalar_fields.set_values(0.0); - elem_tensor_fields.set_values(0.0); - - - // ----------------------------------------------------------------------- - // save the output fields to a single element average array for all state - // ----------------------------------------------------------------------- - for (int mat_id = 0; mat_id < num_mats; mat_id++) { - - // material point and guass point state are concatenated together - concatenate_elem_fields(State.MaterialPoints, - State.GaussPoints, - elem_scalar_fields, - elem_tensor_fields, - State.MaterialToMeshMaps.elem_in_mat_elem, - SimulationParamaters.output_options.output_elem_state, - SimulationParamaters.output_options.output_gauss_pt_state, - State.MaterialToMeshMaps.num_mat_elems.host(mat_id), - mat_id, - num_elems, - den_id, - pres_id, - sie_id, - sspd_id, - mass_id, - stress_id, - vol_id, - div_id, - level_set_id, - vel_grad_id, - conductivity_id, - specific_heat_id); - } // end for mats - - // make specific fields for the element average - if (sie_id>=0){ - FOR_ALL(elem_gid, 0, num_elems, { - // get sie by dividing by the mass - elem_scalar_fields(sie_id, elem_gid) /= (elem_scalar_fields(mass_id, elem_gid)+1.e-20); - }); - } // end if - - Kokkos::fence(); - elem_scalar_fields.update_host(); - elem_tensor_fields.update_host(); - - - // ************************ - // Build the nodal fields - // ************************ - - // save the nodal fields to an array for exporting to graphics files - DCArrayKokkos node_scalar_fields(num_node_scalar_vars, num_nodes, "node_scalars"); - DCArrayKokkos node_vector_fields(num_node_vector_vars, num_nodes, 3, "node_tenors"); - - concatenate_nodal_fields(State.node, - node_scalar_fields, - node_vector_fields, - SimulationParamaters.output_options.output_node_state, - dt, 
- num_nodes, - num_dims, - node_mass_id, - node_vel_id, - node_accel_id, - node_coord_id, - node_grad_level_set_id, - node_temp_id); - - - Kokkos::fence(); - node_scalar_fields.update_host(); - node_vector_fields.update_host(); - - - // ******************************** - // Write the nodal and elem fields - // ******************************** - - if (SimulationParamaters.output_options.format == output_options::viz || - SimulationParamaters.output_options.format == output_options::viz_and_state) { - - // create the folder structure if it does not exist - struct stat st; - - if (stat("vtk", &st) != 0) { - int returnCode = system("mkdir vtk"); - - if (returnCode == 1) { - std::cout << "Unable to make vtk directory" << std::endl; - } - } - else{ - if(solver_id==0 && graphics_id==0){ - // delete the existing files inside - int returnCode = system("rm vtk/Fierro*"); - if (returnCode == 1) { - std::cout << "Unable to clear vtk/Fierro directory" << std::endl; - } - } - } - - if (stat("vtk/data", &st) != 0) { - int returnCode = system("mkdir vtk/data"); - if (returnCode == 1) { - std::cout << "Unable to make vtk/data directory" << std::endl; - } - } - else{ - if(solver_id==0 && graphics_id==0){ - // delete the existing files inside the folder - int returnCode = system("rm vtk/data/Fierro*"); - if (returnCode == 1) { - std::cout << "Unable to clear vtk/data directory" << std::endl; - } - } - } - - // call the .vtu writer for element fields - std::string elem_fields_name = "fields"; - - // make a view of node coords for passing into functions - ViewCArray node_coords_host(&State.node.coords.host(0,0), num_nodes, num_dims); - ViewCArray nodes_in_elem_host(&mesh.nodes_in_elem.host(0,0), num_elems, num_nodes_in_elem); - - - write_vtu(node_coords_host, - nodes_in_elem_host, - elem_scalar_fields, - elem_tensor_fields, - node_scalar_fields, - node_vector_fields, - elem_scalar_var_names, - elem_tensor_var_names, - node_scalar_var_names, - node_vector_var_names, - elem_fields_name, 
- graphics_id, - num_nodes, - num_elems, - num_nodes_in_elem, - Pn_order, - num_dims, - solver_id); - - - // ******************************** - // Build and write the mat fields - // ******************************** - - - // note: the file path and folder was created in the elem and node outputs - size_t num_mat_files_written = 0; - if(num_mat_pt_scalar_vars > 0 || num_mat_pt_tensor_vars >0){ - - for (int mat_id = 0; mat_id < num_mats; mat_id++) { - - const size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); - - // only save material data if the mat lives on the mesh, ie. has state allocated - if (num_mat_elems>0){ - - // set the nodal vars to zero size, we don't write these fields again - node_scalar_var_names.clear(); - node_vector_var_names.clear(); - - // the arrays storing all the material field data - DCArrayKokkos mat_elem_scalar_fields(num_mat_pt_scalar_vars, num_mat_elems, "mat_pt_scalars"); - DCArrayKokkos mat_elem_tensor_fields(num_mat_pt_tensor_vars, num_mat_elems, 3, 3, "mat_pt_tensors"); - - - // concatenate material fields into a single array - concatenate_mat_fields(State.MaterialPoints, - mat_elem_scalar_fields, - mat_elem_tensor_fields, - State.MaterialToMeshMaps.elem_in_mat_elem, - SimulationParamaters.output_options.output_mat_pt_state, - num_mat_elems, - mat_id, - mat_den_id, - mat_pres_id, - mat_sie_id, - mat_sspd_id, - mat_mass_id, - mat_volfrac_id, - mat_geo_volfrac_id, - mat_eroded_id, - mat_stress_id, - mat_conductivity_id, - mat_specific_heat_id); - Kokkos::fence(); - mat_elem_scalar_fields.update_host(); - mat_elem_tensor_fields.update_host(); - - - std::string str_mat_val = std::to_string(mat_id); - std::string mat_fields_name = "mat"; - mat_fields_name += str_mat_val; // add the mat number - - // save the nodes belonging to this part (i.e., the material) - DCArrayKokkos mat_node_coords(num_nodes,num_dims, "mat_node_coords"); - DCArrayKokkos mat_nodes_in_mat_elem(num_mat_elems, num_nodes_in_elem, 
"mat_nodes_in_mat_elem"); - - // the number of actual nodes belonging to the part (i.e., the material) - size_t num_mat_nodes = 0; - - // build a unique mesh (element and nodes) for the material (i.e., the part) - build_material_elem_node_lists(mesh, - State.node.coords, - mat_node_coords, - mat_nodes_in_mat_elem, - State.MaterialToMeshMaps.elem_in_mat_elem, - mat_id, - num_mat_nodes, - num_mat_elems, - num_nodes_in_elem, - num_dims); - - ViewCArray mat_node_coords_host(&mat_node_coords.host(0,0), num_mat_nodes, num_dims); - ViewCArray mat_nodes_in_elem_host(&mat_nodes_in_mat_elem.host(0,0), num_mat_elems, num_nodes_in_elem); - - // write out a vtu file this - write_vtu(mat_node_coords_host, - mat_nodes_in_elem_host, - mat_elem_scalar_fields, - mat_elem_tensor_fields, - node_scalar_fields, - node_vector_fields, - mat_elem_scalar_var_names, - mat_elem_tensor_var_names, - node_scalar_var_names, - node_vector_var_names, - mat_fields_name, - graphics_id, - num_mat_nodes, - num_mat_elems, - num_nodes_in_elem, - Pn_order, - num_dims, - solver_id); - - - num_mat_files_written++; - - } // end for mat_id - - } // end if material is on the mesh - - } // end if mat variables are to be written - - - // ************************************************* - // write Paraview files to open the graphics files - // ************************************************* - - // save the graphics time - graphics_times(graphics_id) = time_value; - - // check to see if an mesh state was written - bool write_mesh_state = false; - if( num_elem_scalar_vars > 0 || - num_elem_tensor_vars > 0 || - num_node_scalar_vars > 0 || - num_node_vector_vars > 0) - { - write_mesh_state = true; - } - - // check to see if a mat state was written - bool write_mat_pt_state = false; - if( num_mat_pt_scalar_vars > 0 || - num_mat_pt_tensor_vars > 0) - { - write_mat_pt_state = true; - } - - // call the vtm file writer - std::string mat_fields_name = "mat"; - write_vtm(graphics_times, - elem_fields_name, - 
mat_fields_name, - time_value, - graphics_id, - num_mat_files_written, - write_mesh_state, - write_mat_pt_state, - solver_id); - - // call the pvd file writer - write_pvd(graphics_times, - time_value, - graphics_id, - solver_id); - - - // increment graphics id counter - graphics_id++; // this is private variable in the class - - } // end if viz paraview output is to be written - - - // STATE - if (SimulationParamaters.output_options.format == output_options::state || - SimulationParamaters.output_options.format == output_options::viz_and_state) { - - write_material_point_state(mesh, - State, - SimulationParamaters, - time_value, - graphics_times, - node_states, - gauss_pt_states, - material_pt_states); - - } // end if state is to be written - - - // will drop ensight outputs in the near future - if (SimulationParamaters.output_options.format == output_options::ensight){ - write_ensight(mesh, - State, - SimulationParamaters, - dt, - time_value, - graphics_times, - node_states, - gauss_pt_states, - material_pt_states); - } - - return; - - } // end write_mesh - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_ensight - /// - /// \brief Writes an ensight output file - /// - /// \param Simulation mesh - /// \param State data - /// \param Simulation parameters - /// \param current time value - /// \param Vector of all graphics output times - /// - ///////////////////////////////////////////////////////////////////////////// - void write_ensight(Mesh_t& mesh, - State_t& State, - SimulationParameters_t& SimulationParamaters, - double dt, - double time_value, - CArray graphics_times, - std::vector node_states, - std::vector gauss_pt_states, - std::vector material_pt_states) - { - size_t num_mats = State.MaterialPoints.num_material_points.size(); - - // ---- Update host data ---- - - // material point values - State.MaterialPoints.den.update_host(); - State.MaterialPoints.pres.update_host(); - 
State.MaterialPoints.stress.update_host(); - State.MaterialPoints.sspd.update_host(); - State.MaterialPoints.sie.update_host(); - State.MaterialPoints.mass.update_host(); - State.MaterialPoints.eroded.update_host(); - - - // gauss point values - State.GaussPoints.vol.update_host(); - - // nodal values - State.node.coords.update_host(); - State.node.vel.update_host(); - State.node.mass.update_host(); - - Kokkos::fence(); - - // -------------------------- - - const int num_scalar_vars = 10; - const int num_vec_vars = 3; - - std::string name_tmp; - name_tmp = "Outputs_SGH"; - - char* name = new char [name_tmp.length() + 1]; - std::strcpy(name, name_tmp.c_str()); - - const char scalar_var_names[num_scalar_vars][15] = { - "den", "pres", "sie", "vol", "mass", "sspd", "speed", "mat_id", "elem_switch", "eroded" - }; - - const char vec_var_names[num_vec_vars][15] = { - "pos", "vel", "accel" - }; - - // short hand - const size_t num_nodes = mesh.num_nodes; - const size_t num_elems = mesh.num_elems; - const size_t num_dims = mesh.num_dims; - - // save the cell state to an array for exporting to graphics files - auto elem_fields = CArray(num_elems, num_scalar_vars); - int elem_switch = 1; - - - DCArrayKokkos speed(num_elems, "speed"); - FOR_ALL(elem_gid, 0, num_elems, { - double elem_vel[3]; // note:initialization with a list won't work - elem_vel[0] = 0.0; - elem_vel[1] = 0.0; - elem_vel[2] = 0.0; - // get the coordinates of the element center - for (int node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { - elem_vel[0] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 0); - elem_vel[1] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 1); - if (mesh.num_dims == 3) { - elem_vel[2] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 2); - } - else{ - elem_vel[2] = 0.0; - } - } // end loop over nodes in element - elem_vel[0] = elem_vel[0] / mesh.num_nodes_in_elem; - elem_vel[1] = elem_vel[1] / mesh.num_nodes_in_elem; - elem_vel[2] = elem_vel[2] / 
mesh.num_nodes_in_elem; - - double speed_sqrd = 0.0; - for (int dim = 0; dim < num_dims; dim++) { - speed_sqrd += elem_vel[dim] * elem_vel[dim]; - } - speed(elem_gid) = sqrt(speed_sqrd); - }); // end parallel for - speed.update_host(); - Kokkos::fence(); - - // save the output scale fields to a single 2D array - - // export material centeric data to the elements - for (int mat_id = 0; mat_id < num_mats; mat_id++) { - size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); - - for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) { - // 1 material per element - - // get elem gid - size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); - - // save outputs - elem_fields(elem_gid, 0) = State.MaterialPoints.den.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 1) = State.MaterialPoints.pres.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 2) = State.MaterialPoints.sie.host(mat_id, mat_elem_sid); - // 3 is guass point vol - elem_fields(elem_gid, 4) = State.MaterialPoints.mass.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 5) = State.MaterialPoints.sspd.host(mat_id, mat_elem_sid); - // 6 is elem speed - elem_fields(elem_gid, 7) = (double)mat_id; - // 8 is the e_switch - elem_fields(elem_gid, 9) = (double)State.MaterialPoints.eroded.host(mat_id, mat_elem_sid); - } // end for mat elems storage - } // end parallel loop over materials - - // export element centric data - double e_switch = 1; - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - elem_fields(elem_gid, 3) = State.GaussPoints.vol.host(elem_gid); - elem_fields(elem_gid, 6) = speed.host(elem_gid); - elem_fields(elem_gid, 8) = e_switch; - elem_switch *= -1; - } // end for elem_gid - - // save the vertex vector fields to an array for exporting to graphics files - CArray vec_fields(num_nodes, num_vec_vars, 3); - - for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { - // position, var 0 - vec_fields(node_gid, 0, 
0) = State.node.coords.host(node_gid, 0); - vec_fields(node_gid, 0, 1) = State.node.coords.host(node_gid, 1); - if (num_dims == 2) { - vec_fields(node_gid, 0, 2) = 0.0; - } - else{ - vec_fields(node_gid, 0, 2) = State.node.coords.host(node_gid, 2); - } - - // velocity, var 1 - vec_fields(node_gid, 1, 0) = State.node.vel.host(node_gid, 0); - vec_fields(node_gid, 1, 1) = State.node.vel.host(node_gid, 1); - if (num_dims == 2) { - vec_fields(node_gid, 1, 2) = 0.0; - } - else{ - vec_fields(node_gid, 1, 2) = State.node.vel.host(node_gid, 2); - } - - // accelleration, var 2 - vec_fields(node_gid, 2, 0) = (State.node.vel.host(node_gid, 0) - State.node.vel_n0.host(node_gid, 0))/dt; - vec_fields(node_gid, 2, 1) = (State.node.vel.host(node_gid, 1) - State.node.vel_n0.host(node_gid, 1))/dt; - if (num_dims == 2) { - vec_fields(node_gid, 2, 2) = 0.0; - } - else{ - vec_fields(node_gid, 2, 2) = (State.node.vel.host(node_gid, 2) - State.node.vel_n0.host(node_gid, 2))/dt; - } - - - } // end for loop over vertices - - - // --------------------------------------------------------------------------- - // Setup of file and directoring for exporting - // --------------------------------------------------------------------------- - FILE* out[20]; // the output files that are written to - char filename[128]; - int max_len = sizeof filename; - int str_output_len; - - struct stat st; - - if (stat("ensight", &st) != 0) { - system("mkdir ensight"); - } - - if (stat("ensight/data", &st) != 0) { - system("mkdir ensight/data"); - } - - // --------------------------------------------------------------------------- - // Write the Geometry file - // --------------------------------------------------------------------------- - // sprintf(filename, "ensight/data/%s.%05d.geo", name, graphics_id); - str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.geo", name, graphics_id); - // filename has the full string - if (str_output_len >= max_len) { fputs("Filename length exceeded; string 
truncated", stderr); } - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "A graphics dump by Fierro \n"); - - fprintf(out[0], "%s", "EnSight Gold geometry\n"); - fprintf(out[0], "%s", "node id assign\n"); - fprintf(out[0], "%s", "element id assign\n"); - - fprintf(out[0], "part\n"); - fprintf(out[0], "%10d\n", 1); - fprintf(out[0], "Mesh\n"); - - // --- vertices --- - fprintf(out[0], "coordinates\n"); - fprintf(out[0], "%10lu\n", num_nodes); - - // write all components of the point coordinates - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 0)); - } - - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 1)); - } - - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - if (num_dims == 3) { - fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 2)); - } - else{ - fprintf(out[0], "%12.5e\n", 0.0); - } - } - - // --- elements --- - if (num_dims == 3) { - fprintf(out[0], "hexa8\n"); - } - else{ - fprintf(out[0], "quad4\n"); - } - fprintf(out[0], "%10lu\n", num_elems); - - - int convert_ijk_to_ensight[8]; - if(mesh.num_dims==3){ - convert_ijk_to_ensight[0] = 0; - convert_ijk_to_ensight[1] = 1; - convert_ijk_to_ensight[2] = 3; - convert_ijk_to_ensight[3] = 2; - convert_ijk_to_ensight[4] = 4; - convert_ijk_to_ensight[5] = 5; - convert_ijk_to_ensight[6] = 7; - convert_ijk_to_ensight[7] = 6; - } - else{ - - convert_ijk_to_ensight[0] = 0; - convert_ijk_to_ensight[1] = 1; - convert_ijk_to_ensight[2] = 2; - convert_ijk_to_ensight[3] = 3; - convert_ijk_to_ensight[4] = 4; - convert_ijk_to_ensight[5] = 5; - convert_ijk_to_ensight[6] = 6; - convert_ijk_to_ensight[7] = 7; - } // end if - - - // write all global point numbers for this cell - for (int elem_gid = 0; elem_gid < num_elems; elem_gid++) { - for (int node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { - fprintf(out[0], "%10lu\t", 
mesh.nodes_in_elem.host(elem_gid, convert_ijk_to_ensight[node_lid]) + 1); // note: node_gid starts at 1 - } - fprintf(out[0], "\n"); - } - - fclose(out[0]); - - // --------------------------------------------------------------------------- - // Write the Scalar variable files - // --------------------------------------------------------------------------- - - // ensight_vars = (den, pres,...) - for (int var = 0; var < num_scalar_vars; var++) { - // write a scalar value - // sprintf(filename, "ensight/data/%s.%05d.%s", name, graphics_id, scalar_var_names[var]); - str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, scalar_var_names[var]); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "Per_elem scalar values\n"); - fprintf(out[0], "part\n"); - fprintf(out[0], "%10d\n", 1); - if (num_dims == 3) { - fprintf(out[0], "hexa8\n"); - } - else{ - fprintf(out[0], "quad4\n"); - } - - for (int elem_id = 0; elem_id < num_elems; elem_id++) { - fprintf(out[0], "%12.5e\n", elem_fields(elem_id, var)); - } - - fclose(out[0]); - } // end for var - - // --------------------------------------------------------------------------- - // Write the Vector variable files - // --------------------------------------------------------------------------- - - // ensight vector vars = (position, velocity, force) - for (int var = 0; var < num_vec_vars; var++) { - // sprintf(filename, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); - str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - - out[0] = fopen(filename, "w"); - // fprintf(out[0],"Per_node vector values\n"); - // fprintf(out[0],"part\n"); - // fprintf(out[0],"%10d \n",1); - // fprintf(out[0],"hexa8\n"); // WARNING, maybe bug 
here? - - fprintf(out[0], "Per_node vector values\n"); - fprintf(out[0], "part\n"); - fprintf(out[0], "%10d\n", 1); - fprintf(out[0], "block\n"); - - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 0)); - } - - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 1)); - } - - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 2)); - } - - fclose(out[0]); - } // end for var - - // --------------------------------------------------------------------------- - // Write the case file - // --------------------------------------------------------------------------- - - // sprintf(filename, "ensight/%s.case", name); - str_output_len = snprintf(filename, max_len, "ensight/%s.case", name); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "FORMAT\n"); - fprintf(out[0], "type: ensight gold\n"); - fprintf(out[0], "GEOMETRY\n"); - - // sprintf(filename, "model: data/%s.*****.geo\n", name); - str_output_len = snprintf(filename, max_len, "model: data/%s.*****.geo\n", name); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - - fprintf(out[0], "%s", filename); - fprintf(out[0], "VARIABLE\n"); - - for (int var = 0; var < num_scalar_vars; var++) { - // sprintf(filename, "scalar per element: %s data/%s.*****.%s\n", scalar_var_names[var], name, scalar_var_names[var]); - str_output_len = snprintf(filename, max_len, "scalar per element: %s data/%s.*****.%s\n", scalar_var_names[var], name, scalar_var_names[var]); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - - fprintf(out[0], "%s", filename); - } - - for (int var = 0; var < num_vec_vars; var++) { - // sprintf(filename, "vector per node: %s 
data/%s.*****.%s\n", vec_var_names[var], name, vec_var_names[var]); - str_output_len = snprintf(filename, max_len, "vector per node: %s data/%s.*****.%s\n", vec_var_names[var], name, vec_var_names[var]); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - fprintf(out[0], "%s", filename); - } - - fprintf(out[0], "TIME\n"); - fprintf(out[0], "time set: 1\n"); - fprintf(out[0], "number of steps: %4d\n", graphics_id + 1); - fprintf(out[0], "filename start number: 0\n"); - fprintf(out[0], "filename increment: 1\n"); - fprintf(out[0], "time values: \n"); - - graphics_times(graphics_id) = time_value; - - for (int i = 0; i <= graphics_id; i++) { - fprintf(out[0], "%12.5e\n", graphics_times(i)); - } - fclose(out[0]); - - // --------------------------------------------------------------------------- - // Done writing the graphics dump - // --------------------------------------------------------------------------- - - // increment graphics id counter - graphics_id++; - - delete[] name; - - - return; - } - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_vtk_old - /// - /// \brief Writes a vtk output file - /// - /// \param Simulation mesh - /// \param State data - /// \param Simulation parameters - /// \param current time value - /// \param Vector of all graphics output times - /// - ///////////////////////////////////////////////////////////////////////////// - void write_vtk_old(Mesh_t& mesh, - State_t& State, - SimulationParameters_t& SimulationParamaters, - double dt, - double time_value, - CArray graphics_times, - std::vector node_states, - std::vector gauss_pt_states, - std::vector material_pt_states) - { - - size_t num_mats = State.MaterialPoints.num_material_points.size(); - - // ---- Update host data ---- - - // material point values - State.MaterialPoints.den.update_host(); - State.MaterialPoints.pres.update_host(); - State.MaterialPoints.stress.update_host(); - 
State.MaterialPoints.sspd.update_host(); - State.MaterialPoints.sie.update_host(); - State.MaterialPoints.mass.update_host(); - State.MaterialPoints.conductivity.update_host(); - State.MaterialPoints.temp_grad.update_host(); - State.MaterialPoints.eroded.update_host(); - - - // gauss point values - State.GaussPoints.vol.update_host(); - - // nodal values - State.node.coords.update_host(); - State.node.vel.update_host(); - State.node.mass.update_host(); - State.node.temp.update_host(); - - Kokkos::fence(); - - - const int num_cell_scalar_vars = 13; - const int num_cell_vec_vars = 0; - const int num_cell_tensor_vars = 0; - - const int num_point_scalar_vars = 1; - const int num_point_vec_vars = 2; - - - // Scalar values associated with a cell - const char cell_scalar_var_names[num_cell_scalar_vars][15] = { - "den", "pres", "sie", "vol", "mass", "sspd", "speed", "mat_id", "elem_switch","eroded", "temp_grad_x", "temp_grad_y", "temp_grad_z" - }; - - const char cell_vec_var_names[num_cell_vec_vars][15] = { - - }; - - const char point_scalar_var_names[num_point_scalar_vars][15] = { - "temp" - }; - - const char point_vec_var_names[num_point_vec_vars][15] = { - "pos", "vel" - }; - - // short hand - const size_t num_nodes = mesh.num_nodes; - const size_t num_elems = mesh.num_elems; - const size_t num_dims = mesh.num_dims; - - // save the cell state to an array for exporting to graphics files - auto elem_fields = CArray(num_elems, num_cell_scalar_vars); - int elem_switch = 1; - - DCArrayKokkos speed(num_elems, "speed"); - FOR_ALL(elem_gid, 0, num_elems, { - double elem_vel[3]; // note:initialization with a list won't work - elem_vel[0] = 0.0; - elem_vel[1] = 0.0; - elem_vel[2] = 0.0; - // get the coordinates of the element center - for (int node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { - elem_vel[0] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 0); - elem_vel[1] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 1); - if (mesh.num_dims == 3) 
{ - elem_vel[2] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 2); - } - else{ - elem_vel[2] = 0.0; - } - } // end loop over nodes in element - elem_vel[0] = elem_vel[0] / mesh.num_nodes_in_elem; - elem_vel[1] = elem_vel[1] / mesh.num_nodes_in_elem; - elem_vel[2] = elem_vel[2] / mesh.num_nodes_in_elem; - - double speed_sqrd = 0.0; - for (int dim = 0; dim < num_dims; dim++) { - speed_sqrd += elem_vel[dim] * elem_vel[dim]; - } - speed(elem_gid) = sqrt(speed_sqrd); - }); // end parallel for - speed.update_host(); - Kokkos::fence(); - - // save the output scale fields to a single 2D array - - - // export material centeric data to the elements - for (int mat_id = 0; mat_id < num_mats; mat_id++) { - size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); - - for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) { - // 1 material per element - - // get elem gid - size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); - - // save outputs - elem_fields(elem_gid, 0) = State.MaterialPoints.den.host(mat_id,mat_elem_sid); - elem_fields(elem_gid, 1) = State.MaterialPoints.pres.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 2) = State.MaterialPoints.sie.host(mat_id, mat_elem_sid); - // 3 is guass point vol - elem_fields(elem_gid, 4) = State.MaterialPoints.mass.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 5) = State.MaterialPoints.sspd.host(mat_id, mat_elem_sid); - // 6 is elem speed - elem_fields(elem_gid, 7) = (double)mat_id; - // 8 is the e_switch - elem_fields(elem_gid, 9) = (double)State.MaterialPoints.eroded.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 10) = (double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,0); - elem_fields(elem_gid, 11) = (double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,1); - elem_fields(elem_gid, 12) = (double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,2); - } // end for mat elems storage - } // end parallel loop over 
materials - - // export element centric data - double e_switch = 1; - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - elem_fields(elem_gid, 3) = State.GaussPoints.vol.host(elem_gid); - elem_fields(elem_gid, 6) = speed.host(elem_gid); - elem_fields(elem_gid, 8) = State.GaussPoints.div.host(elem_gid); - elem_switch *= -1; - } // end for elem_gid - - // save the vertex vector fields to an array for exporting to graphics files - CArray vec_fields(num_nodes, num_point_vec_vars, 3); - CArray point_scalar_fields(num_nodes, num_point_scalar_vars); - - for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { - // position, var 0 - vec_fields(node_gid, 0, 0) = State.node.coords.host(node_gid, 0); - vec_fields(node_gid, 0, 1) = State.node.coords.host(node_gid, 1); - if (num_dims == 2) { - vec_fields(node_gid, 0, 2) = 0.0; - } - else{ - vec_fields(node_gid, 0, 2) = State.node.coords.host(node_gid, 2); - } - - // position, var 1 - vec_fields(node_gid, 1, 0) = State.node.vel.host(node_gid, 0); - vec_fields(node_gid, 1, 1) = State.node.vel.host(node_gid, 1); - if (num_dims == 2) { - vec_fields(node_gid, 1, 2) = 0.0; - } - else{ - vec_fields(node_gid, 1, 2) = State.node.vel.host(node_gid, 2); - } - - point_scalar_fields(node_gid, 0) = State.node.temp.host(node_gid); - } // end for loop over vertices - - - FILE* out[20]; // the output files that are written to - char filename[100]; // char string - int max_len = sizeof filename; - int str_output_len; - - struct stat st; - - if (stat("vtk", &st) != 0) { - system("mkdir vtk"); - } - - // snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); - - //sprintf(filename, "vtk/Fierro.%05d.vtk", graphics_id); // mesh file - str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d.vtk", graphics_id); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - // mesh file - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "# vtk 
DataFile Version 2.0\n"); // part 2 - fprintf(out[0], "Mesh for Fierro\n"); // part 2 - fprintf(out[0], "ASCII \n"); // part 3 - fprintf(out[0], "DATASET UNSTRUCTURED_GRID\n\n"); // part 4 - - fprintf(out[0], "POINTS %zu float\n", mesh.num_nodes); - - // write all components of the point coordinates - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { - fprintf(out[0], - "%f %f %f\n", - State.node.coords.host(node_gid, 0), - State.node.coords.host(node_gid, 1), - State.node.coords.host(node_gid, 2)); - } // end for - - /* - --------------------------------------------------------------------------- - Write the elems - --------------------------------------------------------------------------- - */ - - fprintf(out[0], "\n"); - fprintf(out[0], "CELLS %lu %lu\n", mesh.num_elems, mesh.num_elems + mesh.num_elems * mesh.num_nodes_in_elem); // size=all printed values - - int Pn_order = mesh.Pn; - int order[3] = { Pn_order, Pn_order, Pn_order }; - - // const int num_1D_points = Pn_order+1; - - // write all global point numbers for this elem - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { - fprintf(out[0], "%lu ", mesh.num_nodes_in_elem); // num points in this elem - - for (int k = 0; k <= Pn_order; k++) { - for (int j = 0; j <= Pn_order; j++) { - for (int i = 0; i <= Pn_order; i++) { - size_t node_lid = PointIndexFromIJK(i, j, k, order); - fprintf(out[0], "%lu ", mesh.nodes_in_elem.host(elem_gid, node_lid)); - } - } - } - - fprintf(out[0], "\n"); - } // end for - - // Write the element types - fprintf(out[0], "\n"); - fprintf(out[0], "CELL_TYPES %zu \n", mesh.num_elems); - // VTK_LAGRANGE_HEXAHEDRON: 72, - // VTK_HIGHER_ORDER_HEXAHEDRON: 67 - // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 - // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html - // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html - // vtk format: 
https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { - fprintf(out[0], "%d \n", 72); - } - - /* - --------------------------------------------------------------------------- - Write the nodal vector variables to file - --------------------------------------------------------------------------- - */ - - fprintf(out[0], "\n"); - fprintf(out[0], "POINT_DATA %zu \n", mesh.num_nodes); - - // vtk vector vars = (position, velocity) - for (int var = 0; var < num_point_vec_vars; var++) { - fprintf(out[0], "VECTORS %s float \n", point_vec_var_names[var]); - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { - fprintf(out[0], "%f %f %f\n", - vec_fields(node_gid, var, 0), - vec_fields(node_gid, var, 1), - vec_fields(node_gid, var, 2)); - } // end for nodes - } // end for vec_vars - - - // vtk scalar vars = (temp) - for (int var = 0; var < num_point_scalar_vars; var++) { - fprintf(out[0], "SCALARS %s float 1\n", point_scalar_var_names[var]); - fprintf(out[0], "LOOKUP_TABLE default\n"); - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { - fprintf(out[0], "%f\n", - point_scalar_fields(node_gid, 0)); - } // end for nodes - } // end for vec_vars - - /* - --------------------------------------------------------------------------- - Write the scalar elem variable to file - --------------------------------------------------------------------------- - */ - fprintf(out[0], "\n"); - fprintf(out[0], "CELL_DATA %zu \n", mesh.num_elems); - - for (int var = 0; var < num_cell_scalar_vars; var++) { - fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] - fprintf(out[0], "LOOKUP_TABLE default\n"); - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { - fprintf(out[0], "%f\n", elem_fields(elem_gid, var)); - } // end for elem - } // end for cell scalar_vars - - 
fclose(out[0]); - - graphics_times(graphics_id) = time_value; - - // Write time series metadata - //sprintf(filename, "vtk/Fierro.vtk.series", graphics_id); // mesh file - str_output_len = snprintf(filename, max_len, "vtk/Fierro.vtk.series"); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - // mesh file - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "{\n"); - fprintf(out[0], " \"file-series-version\" : \"1.0\",\n"); - fprintf(out[0], " \"files\" : [\n"); - - for (int i = 0; i <= graphics_id; i++) { - fprintf(out[0], " { \"name\" : \"Fierro.%05d.vtk\", \"time\" : %12.5e },\n", i, graphics_times(i) ); - } - - // fprintf(out[0], "%12.5e\n", graphics_times(i)); - fprintf(out[0], " ]\n"); // part 4 - fprintf(out[0], "}"); // part 4 - - fclose(out[0]); - - // increment graphics id counter - graphics_id++; - - - } // end write vtk old - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn concatenate_elem_fields - /// - /// \brief A function to calculate the average of elem fields and concatentate into 1 array - /// - /// - /// \param MaterialPoints a struct containing the material point state arrays - /// \param elem_scalar_fields the scalar fields - /// \param elem_tensor_fields the tensor fields - /// \param elem_in_mat_elem a listing of the element ids the material resides in - /// \param output_elem_state a std::vector of enums specifying the elem avg outputs - /// \param num_mat_elems the number of elements the material resides in - /// \param mat_id the index for the material - /// - ///////////////////////////////////////////////////////////////////////////// - void concatenate_elem_fields(const MaterialPoint_t& MaterialPoints, - const GaussPoint_t& GaussPoints, - DCArrayKokkos& elem_scalar_fields, - DCArrayKokkos& elem_tensor_fields, - const DRaggedRightArrayKokkos& elem_in_mat_elem, - const std::vector& output_elem_state, - const std::vector& 
output_gauss_pt_states, - const size_t num_mat_elems, - const size_t mat_id, - const size_t num_elems, - const int den_id, - const int pres_id, - const int sie_id, - const int sspd_id, - const int mass_id, - const int stress_id, - const int vol_id, - const int div_id, - const int level_set_id, - const int vel_grad_id, - const int conductivity_id, - const int specific_heat_id) - { - - // --- loop over the material point states - - for (auto field : output_elem_state){ - switch(field){ - // scalar vars - case material_pt_state::density: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(den_id, elem_gid) += MaterialPoints.den(mat_id, mat_elem_sid)* - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::pressure: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(pres_id, elem_gid) += MaterialPoints.pres(mat_id, mat_elem_sid)* - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::specific_internal_energy: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - // extensive ie here, but after this function, it will become specific ie - elem_scalar_fields(sie_id, elem_gid) += MaterialPoints.mass(mat_id, mat_elem_sid)* - MaterialPoints.sie(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::sound_speed: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(sspd_id, elem_gid) += MaterialPoints.sspd(mat_id, mat_elem_sid)* - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - 
MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::mass: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(mass_id, elem_gid) += MaterialPoints.mass(mat_id, mat_elem_sid); - }); - break; - // --------------- - // tensor vars - // --------------- - case material_pt_state::stress: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - // average tensor fields, it is always 3D - // note: paraview is row-major, CArray convention - for (size_t i=0; i<3; i++){ - for(size_t j=0; j<3; j++){ - - // stress tensor - elem_tensor_fields(stress_id, elem_gid, i, j) += - MaterialPoints.stress(mat_id, mat_elem_sid,i,j) * - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - } // end for - } // end for - }); - break; - - // thermal solver vars - case material_pt_state::thermal_conductivity: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(conductivity_id, elem_gid) += MaterialPoints.conductivity(mat_id, mat_elem_sid)* - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - - case material_pt_state::specific_heat: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(specific_heat_id, elem_gid) += MaterialPoints.specific_heat(mat_id, mat_elem_sid)* - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - - - // add other variables here - - // not used variables - case material_pt_state::volume_fraction: - break; - case material_pt_state::eroded_flag: - break; - case 
material_pt_state::elastic_modulii: - break; - case material_pt_state::shear_modulii: - break; - case material_pt_state::poisson_ratios: - break; - case material_pt_state::heat_flux: - break; - } // end switch - }// end for over mat point state - - - // --- add loop over gauss points --- - - // export element centric data - for (auto field : output_gauss_pt_states){ - switch(field){ - // scalars - case gauss_pt_state::volume: - - FOR_ALL(elem_gid, 0, num_elems, { - elem_scalar_fields(vol_id, elem_gid) = GaussPoints.vol(elem_gid); - }); - - break; - case gauss_pt_state::divergence_velocity: - - FOR_ALL(elem_gid, 0, num_elems, { - elem_scalar_fields(div_id, elem_gid) = GaussPoints.div(elem_gid); - }); - - break; - - case gauss_pt_state::level_set: - - FOR_ALL(elem_gid, 0, num_elems, { - elem_scalar_fields(level_set_id, elem_gid) = GaussPoints.level_set(elem_gid); - }); - - break; - - // tensors - case gauss_pt_state::gradient_velocity: - // note: paraview is row-major, CArray convention - FOR_ALL(elem_gid, 0, num_elems, { - for (size_t i=0; i<3; i++){ - for(size_t j=0; j<3; j++){ - elem_tensor_fields(vel_grad_id, elem_gid, i, j) = - GaussPoints.vel_grad(elem_gid, i, j); - } - } // end for - }); - - break; - - // add other gauss variables here - - } // end switch - } // end loop over gauss_pt_states - - - // --- add end gauss point loop -- - - } // end of function - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn concatenate_mat_fields - /// - /// \brief A function to concatentate material fields into 1 array - /// - /// - /// \param MaterialPoints a struct containing the material point state arrays - /// \param elem_scalar_fields the scalar fields - /// \param elem_tensor_fields the tensor fields - /// \param elem_in_mat_elem a listing of the element ids the material resides in - /// \param output_material_pt_states a std::vector of enums specifying the model - /// \param num_mat_elems the number of elements the 
material resides in - /// \param mat_id the index for the material - /// - ///////////////////////////////////////////////////////////////////////////// - void concatenate_mat_fields(const MaterialPoint_t& MaterialPoints, - DCArrayKokkos& mat_elem_scalar_fields, - DCArrayKokkos& mat_elem_tensor_fields, - const DRaggedRightArrayKokkos& elem_in_mat_elem, - const std::vector& output_material_pt_states, - const size_t num_mat_elems, - const size_t mat_id, - const int mat_den_id, - const int mat_pres_id, - const int mat_sie_id, - const int mat_sspd_id, - const int mat_mass_id, - const int mat_volfrac_id, - const int mat_geo_volfrac_id, - const int mat_eroded_id, - const int mat_stress_id, - const int mat_conductivity_id, - const int mat_specific_heat_id) - { - - // --- loop over the material point states - - for (auto field : output_material_pt_states){ - switch(field){ - // scalar vars - case material_pt_state::density: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - mat_elem_scalar_fields(mat_den_id, mat_elem_sid) = MaterialPoints.den(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::pressure: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - mat_elem_scalar_fields(mat_pres_id, mat_elem_sid) = MaterialPoints.pres(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::specific_internal_energy: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - // extensive ie here, but after this function, it will become specific ie - mat_elem_scalar_fields(mat_sie_id, mat_elem_sid) = MaterialPoints.sie(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::sound_speed: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - mat_elem_scalar_fields(mat_sspd_id, mat_elem_sid) = MaterialPoints.sspd(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::mass: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - mat_elem_scalar_fields(mat_mass_id, mat_elem_sid) = MaterialPoints.mass(mat_id, mat_elem_sid); - }); - 
break; - case material_pt_state::volume_fraction: - // material volume fraction - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - // this is the volume fraction of a material within a part - mat_elem_scalar_fields(mat_volfrac_id, mat_elem_sid) = MaterialPoints.volfrac(mat_id, mat_elem_sid); - }); - - // geometric volume fraction - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - // this is the geometric volume fraction (interface reconstruction) - mat_elem_scalar_fields(mat_geo_volfrac_id, mat_elem_sid) = MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::eroded_flag: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - mat_elem_scalar_fields(mat_eroded_id, mat_elem_sid) = (double)MaterialPoints.eroded(mat_id, mat_elem_sid); - }); - break; - // --------------- - // tensor vars - // --------------- - case material_pt_state::stress: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - // average tensor fields, it is always 3D - // note: paraview is row-major, CArray convention - for (size_t i=0; i<3; i++){ - for(size_t j=0; j<3; j++){ - - // stress tensor - mat_elem_tensor_fields(mat_stress_id, mat_elem_sid, i, j) = - MaterialPoints.stress(mat_id, mat_elem_sid,i,j); - } // end for - } // end for - }); - break; - - // thermal solver vars - case material_pt_state::thermal_conductivity: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - mat_elem_scalar_fields(mat_conductivity_id, elem_gid) += MaterialPoints.conductivity(mat_id, mat_elem_sid); - }); - break; - - case material_pt_state::specific_heat: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - mat_elem_scalar_fields(mat_specific_heat_id, elem_gid) += MaterialPoints.specific_heat(mat_id, mat_elem_sid); - }); - break; - - // add other variables here - - // not used variables 
- case material_pt_state::elastic_modulii: - break; - case material_pt_state::shear_modulii: - break; - case material_pt_state::poisson_ratios: - break; - case material_pt_state::heat_flux: - break; - } // end switch - }// end for over mat point state - - - } // end of function - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn concatenate_nodal_fields - /// - /// \brief A function to calculate the average of elem fields - /// - /// - /// \param Node a struct containing the material point state arrays - /// \param elem_scalar_fields the scalar fields - /// \param elem_tensor_fields the tensor fields - /// \param elem_in_mat_elem a listing of the element ids the material resides in - /// \param output_node_states a std::vector of enums specifying the model - /// \param num_mat_elems the number of elements the material resides in - /// \param mat_id the index for the material - /// - ///////////////////////////////////////////////////////////////////////////// - void concatenate_nodal_fields(const node_t& Node, - DCArrayKokkos& node_scalar_fields, - DCArrayKokkos& node_vector_fields, - std::vector& output_node_states, - double dt, - const size_t num_nodes, - const size_t num_dims, - const int node_mass_id, - const int node_vel_id, - const int node_accel_id, - const int node_coord_id, - const int node_grad_level_set_id, - const int node_temp_id) - { - for (auto field : output_node_states){ - switch(field){ - // scalars - case node_state::mass: - - FOR_ALL(node_gid, 0, num_nodes, { - node_scalar_fields(node_mass_id, node_gid) = Node.mass(node_gid); - }); - - break; - case node_state::temp: - FOR_ALL(node_gid, 0, num_nodes, { - node_scalar_fields(node_temp_id, node_gid) = Node.temp(node_gid); - }); - - break; - - // vector fields - - case node_state::coords: - - FOR_ALL(node_gid, 0, num_nodes, { - - node_vector_fields(node_coord_id, node_gid, 0) = Node.coords(node_gid, 0); - node_vector_fields(node_coord_id, node_gid, 1) = 
Node.coords(node_gid, 1); - if (num_dims == 2) { - node_vector_fields(node_coord_id, node_gid, 2) = 0.0; - } - else{ - node_vector_fields(node_coord_id, node_coord_id, 2) = Node.coords(node_gid, 2); - } // end if - - }); // end parallel for - - break; - case node_state::velocity: - - FOR_ALL(node_gid, 0, num_nodes, { - - // velocity, var is node_vel_id - node_vector_fields(node_vel_id, node_gid, 0) = Node.vel(node_gid, 0); - node_vector_fields(node_vel_id, node_gid, 1) = Node.vel(node_gid, 1); - if (num_dims == 2) { - node_vector_fields(node_vel_id, node_gid, 2) = 0.0; - } - else{ - node_vector_fields(node_vel_id, node_gid, 2) = Node.vel(node_gid, 2); - } // end if - - // accellerate, var is node_accel_id - node_vector_fields(node_accel_id, node_gid, 0) = (Node.vel(node_gid, 0) - Node.vel_n0(node_gid, 0))/dt; - node_vector_fields(node_accel_id, node_gid, 1) = (Node.vel(node_gid, 1) - Node.vel_n0(node_gid, 1))/dt; - if (num_dims == 2) { - node_vector_fields(node_accel_id, node_gid, 2) = 0.0; - } - else{ - node_vector_fields(node_accel_id, node_gid, 2) = (Node.vel(node_gid, 2) - Node.vel_n0(node_gid, 2))/dt; - } // end if - - }); // end parallel for - - break; - - - case node_state::gradient_level_set: - - FOR_ALL(node_gid, 0, num_nodes, { - - // velocity, var is node_vel_id - node_vector_fields(node_grad_level_set_id, node_gid, 0) = Node.gradient_level_set(node_gid, 0); - node_vector_fields(node_grad_level_set_id, node_gid, 1) = Node.gradient_level_set(node_gid, 1); - if (num_dims == 2) { - node_vector_fields(node_grad_level_set_id, node_gid, 2) = 0.0; - } - else{ - node_vector_fields(node_grad_level_set_id, node_gid, 2) = Node.gradient_level_set(node_gid, 2); - } // end if - - }); // end parallel for - - break; - - // -- not used vars - case node_state::force: - break; - - // heat transer vars - case node_state::heat_transfer: - break; - // tensors - } // end switch - } // end for over - - - - } // end function - - 
///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_vtu - /// - /// \brief Writes a vtu ASCII output file - /// - /// \param Simulation mesh - /// \param State data - /// \param Simulation parameters - /// \param current time value - /// \param Vector of all graphics output times - /// - ///////////////////////////////////////////////////////////////////////////// - void write_vtu( - const ViewCArray& node_coords_host, - const ViewCArray& nodes_in_elem_host, - const DCArrayKokkos& elem_scalar_fields, - const DCArrayKokkos& elem_tensor_fields, - const DCArrayKokkos& node_scalar_fields, - const DCArrayKokkos& node_vector_fields, - const std::vector& elem_scalar_var_names, - const std::vector& elem_tensor_var_names, - const std::vector& node_scalar_var_names, - const std::vector& node_vector_var_names, - const std::string partname, - const int graphics_id, - const size_t num_nodes, - const size_t num_elems, - const size_t num_nodes_in_elem, - const int Pn_order, - const size_t num_dims, - const size_t solver_id - ) - { - FILE* out[20]; // the output files that are written to - char filename[100]; // char string - int max_len = sizeof filename; - int str_output_len; - - const size_t num_elem_scalar_vars = elem_scalar_var_names.size(); - const size_t num_elem_tensor_vars = elem_tensor_var_names.size(); - - const size_t num_node_scalar_vars = node_scalar_var_names.size(); - const size_t num_node_vector_vars = node_vector_var_names.size(); - - - // create filename - str_output_len = snprintf(filename, max_len, "vtk/data/Fierro.solver%zu.%s.%05d.vtu", - solver_id, partname.c_str(), graphics_id); - - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - // mesh file - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "\n"); - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - fprintf(out[0], " \n", num_nodes, num_elems); - - /* - 
--------------------------------------------------------------------------- - Write the mesh points - --------------------------------------------------------------------------- - */ - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - - // write all components of the point coordinates - for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { - double coord_z = 0.0; - if(num_dims==3){ - coord_z = node_coords_host(node_gid, 2); - } - fprintf(out[0], - " %f %f %f\n", - node_coords_host(node_gid, 0), - node_coords_host(node_gid, 1), - coord_z); - } // end for - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - - /* - --------------------------------------------------------------------------- - Write the elems - --------------------------------------------------------------------------- - */ - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - - // WARNING: look into high-order Pn 2D elements with paraview - int Pn_order_z = 0; - if (num_dims == 3){ - Pn_order_z = Pn_order; - } - int order[3] = {Pn_order, Pn_order, Pn_order_z}; - - // const int num_1D_points = Pn_order+1; - - // write all global point numbers for this elem - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - fprintf(out[0], " "); // adding indentation before printing nodes in element - if (num_dims==3 && Pn_order>1){ - for (int k = 0; k <= Pn_order_z; k++) { - for (int j = 0; j <= Pn_order; j++) { - for (int i = 0; i <= Pn_order; i++) { - size_t node_lid = PointIndexFromIJK(i, j, k, order); - fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); - } - } - } // end for - } - else if (num_dims == 3 && Pn_order == 1){ - // 3D linear hexahedral elements - for (int node_lid = 0; node_lid < 8; node_lid++) { - fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); - } // end for - } - else if (num_dims == 2){ - // 2D linear is the only supported option - for (int 
node_lid = 0; node_lid < 4; node_lid++) { - fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); - } // end for - } - else { - std::cout << "ERROR: outputs failed, dimensions and element types are not compatible \n"; - } // end if - fprintf(out[0], "\n"); - } // end for - fprintf(out[0], " \n"); - - // Write the element offsets - fprintf(out[0], " \n"); - size_t count=0; - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - count += num_nodes_in_elem; - fprintf(out[0], " %lu\n", count); // num points in this elem + all others before it - } // end for - fprintf(out[0], " \n"); - - - // Write the element types - fprintf(out[0], " \n"); - // ---- - // linear element types - // VTK_PIXEL = 8, linear 2D quad with i,j,k indexing (future format for 2D solver) - // VTK_Quad = 9, linear 2D quad with ensight index ordering (current 2D rz convention) - // VTK_VOXEL = 11, linear 3D hex with i,j,k indexing (current format) - // arbitrary order types - // VTK_LAGRANGE_QUADRILATERAL = 70, use this type when a 2D high-order scheme exists - // VTK_LAGRANGE_HEXAHEDRON: 72, this is the current 3D high-order - // VTK_HIGHER_ORDER_HEXAHEDRON: 67 - // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 - // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html - // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html - // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - if (num_dims==3 && Pn_order>1){ - fprintf(out[0], " %d \n", 72); - } - else if (num_dims == 3 && Pn_order == 1){ - // 3D linear hex - fprintf(out[0], " %d \n", 11); - } - else { - // 2D ensight mesh ordering - fprintf(out[0], " %d \n", 9); - } - } - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - - - /* - --------------------------------------------------------------------------- - Write the nodal variables to file - 
--------------------------------------------------------------------------- - */ - // vtk vector vars = (position, velocity) - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - if(num_node_vector_vars >0 || num_node_scalar_vars>0){ - - fprintf(out[0], " \n"); - - // node vectors - for (int a_var = 0; a_var < num_node_vector_vars; a_var++) { - fprintf(out[0], " \n", node_vector_var_names[a_var].c_str()); - - for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], " %f %f %f\n", - node_vector_fields.host(a_var, node_gid, 0), - node_vector_fields.host(a_var, node_gid, 1), - node_vector_fields.host(a_var, node_gid, 2)); - } // end for nodes - fprintf(out[0], " \n"); - - } // end for vec_vars - - - // node scalar vars - for (int a_var = 0; a_var < num_node_scalar_vars; a_var++) { - fprintf(out[0], " \n", node_scalar_var_names[a_var].c_str()); - for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], " %f\n", node_scalar_fields.host(a_var, node_gid)); - } // end for nodes - fprintf(out[0], " \n"); - } // end for vec_vars - - fprintf(out[0], " \n"); - - } // end if - - /* - --------------------------------------------------------------------------- - Write the elem variables to file - --------------------------------------------------------------------------- - */ - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - if(num_elem_scalar_vars >0 || num_elem_tensor_vars>0){ - - fprintf(out[0], " \n"); - - for (int a_var = 0; a_var < num_elem_scalar_vars; a_var++) { - - fprintf(out[0], " \n", elem_scalar_var_names[a_var].c_str()); // the 1 is number of scalar components [1:4] - - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - fprintf(out[0], " %f\n", elem_scalar_fields.host(a_var, elem_gid)); - } // end for elem - fprintf(out[0], " \n"); - } // end for elem scalar_vars - - - // tensors - for (int a_var = 0; a_var < num_elem_tensor_vars; a_var++) { - fprintf(out[0], " \n", elem_tensor_var_names[a_var].c_str()); 
// the 1 is number of scalar components [1:4] - - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - // note: paraview is row-major, CArray convention - // Txx Txy Txz Tyx Tyy Tyz Tzx Tzy Tzz - for (size_t i=0; i<3; i++){ - for(size_t j=0; j<3; j++){ - fprintf(out[0], " %f ", elem_tensor_fields.host(a_var, elem_gid, i, j)); - } // end j - } // end i - } // end for elem - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - } // end for elem scalar_vars - - fprintf(out[0], " \n"); - } // end if - - // end of the vtu file - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - fprintf(out[0], "\n"); - - //----------------- - // close the vtu file for element fields - //----------------- - fclose(out[0]); - - } // end write vtu - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_pvd - /// - /// \brief Writes a pvd ASCII output file for the element and nodal fields - /// - /// \param Vector of all graphics output times - /// \param element average field names - /// \param current time value - /// \param graphics index - /// - ///////////////////////////////////////////////////////////////////////////// - void write_pvd(CArray& graphics_times, - double time_value, - int graphics_id, - const size_t solver_id){ - - FILE* out[20]; // the output files that are written to - char filename[100]; // char string - int max_len = sizeof filename; - int str_output_len; - - // Write time series metadata - str_output_len = snprintf(filename, max_len, "vtk/Fierro.solver%zu.pvd", solver_id); - - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - // mesh file - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "\n"); - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - - for (int i = 0; i <= graphics_id; i++) { - fprintf(out[0], " \n", - graphics_times(i), solver_id, i, graphics_times(i) ); - //fprintf(out[0], " \n", - // i, solver_id, i, graphics_times(i) ); - } - - 
fprintf(out[0], " \n"); - fprintf(out[0], ""); - - fclose(out[0]); - - } // end pvd - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_vtm - /// - /// \brief Writes a vtm ASCII output file for all fields -- mesh and material - /// - /// \param Vector of all graphics output times - /// \param element average field names - /// \param current time value - /// \param graphics index - /// - ///////////////////////////////////////////////////////////////////////////// - void write_vtm(CArray& graphics_times, - const std::string& elem_part_name, - const std::string& mat_part_name, - double time_value, - int graphics_id, - int num_mats, - bool write_mesh_state, - bool write_mat_pt_state, - const size_t solver_id) - { - // loop over all the files that were written - for(int file_id=0; file_id<=graphics_id; file_id++){ - - FILE* out[20]; // the output files that are written to - char filename[100]; // char string - int max_len = sizeof filename; - int str_output_len; - - - // Write time series metadata to the data file - str_output_len = snprintf(filename, max_len, "vtk/data/Fierro.solver%zu.%05d.vtm", solver_id, file_id); - - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - // mesh file - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "\n"); - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - - - // Average mesh fields -- node and elem state written - size_t block_id = 0; // this will need to be incremented based on the number of mesh fields written - if (write_mesh_state){ - fprintf(out[0], " \n", block_id); - { - block_id++; // increment block id for material outputs that follow the element avg block - - // elem and nodal fields are in this file - fprintf(out[0], " \n"); - fprintf(out[0], " \n", - file_id, solver_id, elem_part_name.c_str(), file_id, graphics_times(file_id) ); - fprintf(out[0], " \n"); - - // add other Mesh average output Pieces here - } - 
fprintf(out[0], " \n"); - } // end if write elem and node state is true - - // note: the block_id was incremented if an element average field output was made - if (write_mat_pt_state){ - fprintf(out[0], " \n", block_id); - for (size_t mat_id=0; mat_id\n", mat_id, mat_id); - fprintf(out[0], " \n", - file_id, solver_id, mat_part_name.c_str(), mat_id, file_id, graphics_times(file_id) ); - fprintf(out[0], " \n"); - - } // end for loop mat_id - fprintf(out[0], " \n"); - } // end if write mat satte is true - - // done writing the files to be read by the vtm file - fprintf(out[0], " \n"); - fprintf(out[0], ""); - - fclose(out[0]); - - } // end for file_id - - } // end vtm - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn build_material_elem_node_lists - /// - /// \brief Creates elems and nodes for a unique mesh of a material (i.e, a part) - /// - /// \param Simulation mesh - /// \param State node data - /// \param Material node coordinates - /// \param Material nodes in the material element - /// \param Material to mesh map for elements - /// \param number of material nodes - /// \param number of material elements - /// \param number of nodes in the element - /// \param number of dimensions - /// - ///////////////////////////////////////////////////////////////////////////// - void build_material_elem_node_lists( - const Mesh_t& mesh, - const DCArrayKokkos& state_node_coords, - DCArrayKokkos& mat_node_coords, - DCArrayKokkos & mat_nodes_in_mat_elem, - const DRaggedRightArrayKokkos& elem_in_mat_elem, - const size_t mat_id, - size_t& num_mat_nodes, - const size_t num_mat_elems, - const size_t num_nodes_in_elem, - const size_t num_dims) - { - - // helper arrays - DCArrayKokkos dummy_counter(mesh.num_nodes, "dummy_counter"); - DCArrayKokkos access_mat_node_gids(mesh.num_nodes, "access_mat_node_gids"); - dummy_counter.set_values(0); - - // tag and count the number of nodes in this part - FOR_ALL (mat_elem_sid, 0, num_mat_elems, 
{ - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); // WARNING not GPU compatible - - // parallel loop over the nodes in the element - for(size_t node_lid=0; node_lid0 - - } // end for nodes in element - - }); // end parallel for - Kokkos::fence(); - dummy_counter.update_host(); - - // loop opperation is not thread safe, must be run serially - size_t mat_node_gid = 0; - for(size_t node_gid = 0; node_gid0){ - mat_node_coords.host(mat_node_gid, 0) = state_node_coords.host(node_gid, 0); - mat_node_coords.host(mat_node_gid, 1) = state_node_coords.host(node_gid, 1); - if (num_dims == 3){ - mat_node_coords.host(mat_node_gid, 2) = state_node_coords.host(node_gid, 2); - } // end if on dims - - access_mat_node_gids.host(node_gid) = mat_node_gid; // the part node id - - mat_node_gid ++; - - dummy_counter.host(node_gid) = 0; // set counter to zero, it was accounted for - } // end if this node is on the part - - } // end loop over all mesh nodes - mat_node_coords.update_device(); - access_mat_node_gids.update_device(); - dummy_counter.update_device(); - Kokkos::fence(); - - // save the number of nodes defining the material region, i.e., the part - num_mat_nodes = mat_node_gid; - - // save the new node id's - FOR_ALL (mat_elem_sid, 0, num_mat_elems, { - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // parallel loop over the nodes in the element - for(size_t node_lid=0; node_lid graphics_times, - std::vector node_states, - std::vector gauss_pt_states, - std::vector material_pt_states) - { - // WARNING WARNING WARNING: - // This currently assumes the gauss and material point IDs are the same as the element ID - // This will need to be updated for high order methods - - // Update host data - // ---- Update host data ---- - size_t num_mats = State.MaterialPoints.num_material_points.size(); - - State.MaterialPoints.den.update_host(); - State.MaterialPoints.pres.update_host(); - State.MaterialPoints.stress.update_host(); - 
State.MaterialPoints.sspd.update_host(); - State.MaterialPoints.sie.update_host(); - State.MaterialPoints.mass.update_host(); - - State.GaussPoints.vol.update_host(); - - State.node.coords.update_host(); - State.node.vel.update_host(); - State.node.mass.update_host(); - - Kokkos::fence(); - - struct stat st; - - if (stat("state", &st) != 0) { - system("mkdir state"); - } - - size_t num_dims = mesh.num_dims; - - // --------------------------------------------------------------------------- - // Setup of file and directory for exporting - // --------------------------------------------------------------------------- - - // output file - FILE* out_elem_state; // element average state - char filename[128]; - - int max_len = sizeof filename; - - snprintf(filename, max_len, "state/mat_pt_state_t_%6.4e.txt", time_value); - - // output files - out_elem_state = fopen(filename, "w"); - - // write state dump - fprintf(out_elem_state, "# state dump file\n"); - fprintf(out_elem_state, "# x y z radius_2D radius_3D den pres sie sspd vol mass \n"); - - // write out values for the elem - for (size_t mat_id = 0; mat_id < num_mats; mat_id++) { - - size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); - - for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) - { - - const size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); - - double elem_coords[3]; - elem_coords[0] = 0.0; - elem_coords[1] = 0.0; - elem_coords[2] = 0.0; - - // get the coordinates of the element center - for (size_t node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { - - elem_coords[0] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 0); - elem_coords[1] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 1); - if (num_dims == 3) { - elem_coords[2] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 2); - } - else{ - elem_coords[2] = 0.0; - } - } // end loop over nodes 
in element - - elem_coords[0] = elem_coords[0] / ((double)mesh.num_nodes_in_elem); - elem_coords[1] = elem_coords[1] / ((double)mesh.num_nodes_in_elem); - elem_coords[2] = elem_coords[2] / ((double)mesh.num_nodes_in_elem); - - double rad2 = sqrt(elem_coords[0] * elem_coords[0] + - elem_coords[1] * elem_coords[1]); - - double rad3 = sqrt(elem_coords[0] * elem_coords[0] + - elem_coords[1] * elem_coords[1] + - elem_coords[2] * elem_coords[2]); - - - fprintf(out_elem_state, "%4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t \n", - elem_coords[0], - elem_coords[1], - elem_coords[2], - rad2, - rad3, - State.MaterialPoints.den.host(mat_id, mat_elem_sid), - State.MaterialPoints.pres.host(mat_id, mat_elem_sid), - State.MaterialPoints.sie.host(mat_id, mat_elem_sid), - State.MaterialPoints.sspd.host(mat_id, mat_elem_sid), - State.GaussPoints.vol.host(elem_gid), - State.MaterialPoints.mass.host(mat_id, mat_elem_sid) ); - - } // end for elements - - } // end for materials - fclose(out_elem_state); - - - - // printing nodal state - - FILE* out_point_state; // element average state - - snprintf(filename, max_len, "state/node_state_t_%6.4e.txt", time_value); - - // output files - out_point_state = fopen(filename, "w"); - - // write state dump - fprintf(out_point_state, "# state node dump file\n"); - fprintf(out_point_state, "# x y z radius_2D radius_3D vel_x vel_y vel_z speed ||err_v_dot_r|| \n"); - - // get the coordinates of the node - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { - - double node_coords[3]; - - node_coords[0] = State.node.coords.host(node_gid, 0); - node_coords[1] = State.node.coords.host(node_gid, 1); - if (num_dims == 3) { - node_coords[2] = State.node.coords.host(node_gid, 2); - } - else{ - node_coords[2] = 0.0; - } - - double rad2 = sqrt(node_coords[0] * node_coords[0] + - node_coords[1] * node_coords[1]); - double rad3 = sqrt(node_coords[0] * node_coords[0] + - node_coords[1] * 
node_coords[1] + - node_coords[2] * node_coords[2]); - - double node_vel[3]; - - node_vel[0] = State.node.vel.host(node_gid, 0); - node_vel[1] = State.node.vel.host(node_gid, 1); - if (num_dims == 3) { - node_vel[2] = State.node.vel.host(node_gid, 2); - } - else{ - node_vel[2] = 0.0; - } - - double speed = sqrt(node_vel[0] * node_vel[0] + - node_vel[1] * node_vel[1] + - node_vel[2] * node_vel[2]); - - - - // looking at perfect radial motion - double unit_r_vec[2]; - unit_r_vec[0] = node_coords[0]/rad2; - unit_r_vec[1] = node_coords[1]/rad2; - - //the radial motion - double v_dot_r = node_vel[0] * unit_r_vec[0] + - node_vel[1] * unit_r_vec[1]; - - - double err_v_dot_r[3]; - err_v_dot_r[0] = node_vel[0]-unit_r_vec[0]*v_dot_r; - err_v_dot_r[1] = node_vel[1]-unit_r_vec[1]*v_dot_r; - - double mag_err_v_dot_r = sqrt(err_v_dot_r[0]*err_v_dot_r[0] + err_v_dot_r[1]*err_v_dot_r[1]); - - fprintf(out_point_state, "%4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t \n", - node_coords[0], - node_coords[1], - node_coords[2], - rad2, - rad3, - node_vel[0], - node_vel[1], - node_vel[2], - speed, - mag_err_v_dot_r); - - - } // end loop over nodes in element - + // update device side + mesh.nodes_in_elem.update_device(); - fclose(out_point_state); - return; - } // end of state output -}; // end class + // Build connectivity + mesh.build_connectivity(); +} // end build_3d_box -#endif // end Header Guard \ No newline at end of file +#endif \ No newline at end of file diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h new file mode 100644 index 00000000..7a1cb676 --- /dev/null +++ b/examples/mesh_decomp/state.h @@ -0,0 +1,139 @@ +/********************************************************************************************** +� 2020. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. 
Government contract 89233218CNA000001 for Los Alamos +National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All rights in the program are +reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear +Security Administration. The Government is granted for itself and others acting on its behalf a +nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare +derivative works, distribute copies to the public, perform publicly and display publicly, and +to permit others to do so. +This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used +to endorse or promote products derived from this software without specific prior +written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************************/ +#ifndef STATE_H +#define STATE_H + +#include "matar.h" + +using namespace mtr; + + +// Possible node states, used to initialize node_t +enum class node_state +{ + coords +}; + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \struct node_t +/// +/// \brief Stores state information associated with a node +/// +///////////////////////////////////////////////////////////////////////////// +struct node_t +{ + DCArrayKokkos coords; ///< Nodal coordinates + DCArrayKokkos coords_n0; ///< Nodal coordinates at tn=0 of time integration + + // initialization method (num_nodes, num_dims, state to allocate) + void initialize(size_t num_nodes, size_t num_dims, std::vector node_states) + { + for (auto field : node_states){ + switch(field){ + case node_state::coords: + if (coords.size() == 0) this->coords = DCArrayKokkos(num_nodes, num_dims, "node_coordinates"); + if (coords_n0.size() == 0) this->coords_n0 = DCArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + break; + default: + std::cout<<"Desired node state not understood in node_t initialize"< vol; ///< GaussPoint volume + + + // initialization method (num_cells, num_dims) + void initialize(size_t num_gauss_pnts, size_t num_dims, std::vector gauss_pt_states) + { + + for (auto field : gauss_pt_states){ + switch(field){ + case gauss_pt_state::volume: + if (vol.size() == 
0) this->vol = DCArrayKokkos(num_gauss_pnts, "gauss_point_volume"); + break; + default: + std::cout<<"Desired gauss point state not understood in GaussPoint_t initialize"< Date: Wed, 22 Oct 2025 20:08:00 -0500 Subject: [PATCH 03/52] ENH: Adding mesh decomposition example WIP, Initial decomposition nearly done --- examples/mesh_decomp/mesh.h | 7 + examples/mesh_decomp/mesh_decomp.cpp | 338 ++++++++++++++++++++++++++- 2 files changed, 335 insertions(+), 10 deletions(-) diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index 9a7140a3..6d1e31d7 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -303,6 +303,13 @@ struct Mesh_t RaggedRightArrayKokkos bdy_nodes_in_set; ///< Boundary nodes in a boundary set DCArrayKokkos num_bdy_nodes_in_set; ///< Number of boundary nodes in a set + + // MPI Decomposition Data Definitions ---- // + DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping + + DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping + + // initialization methods void initialize_nodes(const size_t num_nodes_inp) { diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 595ab4e0..1aa70f16 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "mesh.h" @@ -15,6 +16,33 @@ #include "ptscotch.h" +void calc_elements_per_rank(std::vector& elems_per_rank, int num_elems, int world_size){ + // Compute elements to send to each rank; handle remainders for non-even distribution + std::fill(elems_per_rank.begin(), elems_per_rank.end(), num_elems / world_size); + int remainder = num_elems % world_size; + for (int i = 0; i < remainder; ++i) { + elems_per_rank[i] += 1; + } +} + +void print_mesh_info(Mesh_t& mesh){ + std::cout<<"Mesh has "< elements_on_rank; + std::vector nodes_on_rank; + + + std::vector elems_per_rank(world_size); + std::vector 
nodes_per_rank(world_size); + + // create a 2D vector of elements to send to each rank + std::vector> elements_to_send(world_size); + + // create a 2D vector of nodes to send to each rank + std::vector> nodes_to_send(world_size); if (rank == 0) { - std::cout<<"Rank "< all_elements; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = elems_per_rank[i]; + displs[i] = displacement; + // Copy elements for rank i to the flattened array + for (int j = 0; j < elems_per_rank[i]; j++) { + all_elements.push_back(elements_to_send[i][j]); + } + displacement += elems_per_rank[i]; + } + // Send the elements to each rank + MPI_Scatterv(all_elements.data(), sendcounts.data(), displs.data(), MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); } + + MPI_Barrier(MPI_COMM_WORLD); + std::cout << "Rank " << rank << " received elements: "; + for (int i = 0; i < num_elements_on_rank; i++) { + std::cout << elements_on_rank[i] << " "; + } + std::cout << std::endl; + if (rank == 0) { + + // Populate the nodes_to_send array by finding all nodes in the elements in elements_to_send and removing duplicates + for (int i = 0; i < world_size; i++) { + std::set nodes_set; + for (int j = 0; j < elems_per_rank[i]; j++) { + for (int k = 0; k < 8; k++) { + nodes_set.insert(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + } + } + nodes_to_send[i] = std::vector(nodes_set.begin(), nodes_set.end()); + } + for (int i = 0; i < world_size; i++) { + nodes_per_rank[i] = nodes_to_send[i].size(); + } + } + + // Send the number of nodes to each rank using MPI_scatter + MPI_Scatter(nodes_per_rank.data(), 1, MPI_INT, + &num_nodes_on_rank, 1, MPI_INT, + 0, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + std::cout << 
"Rank " << rank << " received " << num_nodes_on_rank << " nodes" << std::endl; + nodes_on_rank.resize(num_nodes_on_rank); + + if (rank == 0) { + + // print the nodes_to_send array + for (int i = 0; i < world_size; i++) { + std::cout< all_nodes; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size(); + displs[i] = displacement; + // Copy nodes for rank i to the flattened array + for (int j = 0; j < nodes_to_send[i].size(); j++) { + all_nodes.push_back(nodes_to_send[i][j]); + } + displacement += nodes_to_send[i].size(); + } + // Send the nodes to each rank + // all_nodes.data(): Pointer to the flattened array of all nodes to be sent to each rank + // sendcounts.data(): Array with the number of nodes to send to each rank + // displs.data(): Array with the displacement for each rank in the flattened array + // MPI_INT: Data type of the nodes (integer) + // nodes_on_rank.data(): Pointer to the buffer where each rank will receive its nodes + // num_nodes_on_rank: Number of nodes that the receiving rank expects to receive + // MPI_INT: Data type of the receive buffer (integer) + // 0: The root rank (rank 0) that is performing the scatter + // MPI_COMM_WORLD: The communicator + MPI_Scatterv(all_nodes.data(), sendcounts.data(), displs.data(), MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + + } + + MPI_Barrier(MPI_COMM_WORLD); + + std::cout << "Rank " << rank << " received nodes: "; + for (int i = 0; i < num_nodes_on_rank; i++) { + std::cout << nodes_on_rank[i] << " "; + } + std::cout << std::endl; - if (rank == 0) std::cout<<"Finished decomposition"< nodes_in_elem_on_rank; + + // All ranks need to resize their receive buffer + nodes_in_elem_on_rank.resize(num_elements_on_rank 
* 8); + if (rank == 0) { + // Prepare element-node connectivity data for each rank + std::vector all_nodes_in_elem; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for(int i = 0; i < world_size; i++) { + int num_connectivity_entries = elements_to_send[i].size() * 8; // 8 nodes per element + sendcounts[i] = num_connectivity_entries; + displs[i] = displacement; + + // Copy element-node connectivity for rank i + for(int j = 0; j < elements_to_send[i].size(); j++) { + for(int k = 0; k < 8; k++) { + all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + } + } + displacement += num_connectivity_entries; + } + + // Send the connectivity data to each rank + MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * 8, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * 8, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 0) { + + std::cout << "Rank " << rank << " received element-node connectivity (" + << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; + for (int elem = 0; elem < num_elements_on_rank; elem++) { + std::cout << " Element " << elem << " nodes: "; + for (int node = 0; node < 8; node++) { + int idx = elem * 8 + node; + std::cout << nodes_in_elem_on_rank[idx] << " "; + } + std::cout << std::endl; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 1) { + + std::cout << "Rank " << rank << " received element-node connectivity (" + << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; + for (int elem = 0; elem < num_elements_on_rank; elem++) { + std::cout << " Element " << elem << " nodes: "; + for (int node = 0; node < 8; node++) { + int idx = elem * 8 + node; + 
std::cout << nodes_in_elem_on_rank[idx] << " "; + } + std::cout << std::endl; + } + } + + mesh.initialize_nodes(num_nodes_on_rank); + + std::vector required_node_state = { node_state::coords }; + + + mesh.initialize_elems(num_elements_on_rank, 3); + + + // WARNING WARNING WARNING: THIS IS WRONG< SHOULD BE LOCAL ID. Figure this out + for(int i = 0; i < num_elements_on_rank; i++) { + for(int j = 0; j < 8; j++) { + mesh.nodes_in_elem.host(i, j) = nodes_in_elem_on_rank[i * 8 + j]; + } + } + + mesh.nodes_in_elem.update_device(); + + + mesh.local_to_global_node_mapping = DCArrayKokkos(num_nodes_on_rank, "mesh.local_to_global_node_mapping"); + mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "mesh.local_to_global_elem_mapping"); + + for(int i = 0; i < num_nodes_on_rank; i++) { + mesh.local_to_global_node_mapping.host(i) = nodes_on_rank[i]; + } + + for(int i = 0; i < num_elements_on_rank; i++) { + mesh.local_to_global_elem_mapping.host(i) = elements_on_rank[i]; + } + + mesh.local_to_global_node_mapping.update_device(); + mesh.local_to_global_elem_mapping.update_device(); + // in kernel, I will do the following + // On each rank, I need: + // 1. Numnber of nodes + // 2. node coordinates + // 3. number of elements + // 5. Local node to global node mapping + // 6. Local element to global element mapping + // 7. 
Element-node connectivity + // With the above, I can call build connectivity on the local mesh + + + + // elements_on_rank is now received via MPI_Scatterv above + + + + + // if (rank == 0) std::cout<<"Finished"< Date: Mon, 27 Oct 2025 11:58:17 -0500 Subject: [PATCH 04/52] ENH: Tidying up initial decomposition --- examples/mesh_decomp/mesh.h | 15 +- examples/mesh_decomp/mesh_decomp.cpp | 391 +++++++++++++++++++++------ examples/mesh_decomp/mesh_io.h | 331 ++++++++++++++++++++++- 3 files changed, 640 insertions(+), 97 deletions(-) diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index 6d1e31d7..0011d2e8 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -221,6 +221,8 @@ struct Mesh_t // Patch: A discretization of a surface by subdividing the surface using the nodes // Corner: A element-node pair + bool verbose = false; + // ---- Global Mesh Definitions ---- // mesh_init::elem_name_tag elem_kind = mesh_init::linear_tensor_element; ///< The type of elements used in the mesh @@ -308,6 +310,7 @@ struct Mesh_t DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping + // initialization methods @@ -550,7 +553,7 @@ struct Mesh_t // DCArrayKokkos gauss_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_gauss_in_patch); - printf("Number of dimensions = %zu \n", num_dims); + if (verbose) printf("Number of dimensions = %zu \n", num_dims); if (num_dims == 3) { // num_patches_in_surf = [1^2, 2^2, 3^2, 4^2, ... 
, Pn^2] @@ -973,7 +976,7 @@ struct Mesh_t node_ordering_in_elem.update_device(); Kokkos::fence(); - printf("Built node ordering \n"); + if (verbose) printf("Built node ordering \n"); // for saving the hash keys of the patches and then the neighboring elem_gid CArrayKokkos hash_keys_in_elem(num_elems, num_patches_in_elem, num_nodes_in_patch, "hash_keys_in_elem"); // always 4 ids in 3D @@ -1447,16 +1450,16 @@ struct Mesh_t void build_connectivity() { build_corner_connectivity(); - printf("Built corner connectivity \n"); + if (verbose) printf("Built corner connectivity \n"); build_elem_elem_connectivity(); - printf("Built element-element connectivity \n"); + if (verbose) printf("Built element-element connectivity \n"); build_patch_connectivity(); - printf("Built patch connectivity \n"); + if (verbose) printf("Built patch connectivity \n"); build_node_node_connectivity(); - printf("Built node-node connectivity \n"); + if (verbose) printf("Built node-node connectivity \n"); } ///////////////////////////////////////////////////////////////////////////// diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 1aa70f16..5b6635b7 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -15,6 +15,55 @@ #include "scotch.h" #include "ptscotch.h" +// Timer class for timing the execution of the matrix multiplication +class Timer { + private: + std::chrono::high_resolution_clock::time_point start_time; + std::chrono::high_resolution_clock::time_point end_time; + bool is_running; + + public: + Timer() : is_running(false) {} + + void start() { + start_time = std::chrono::high_resolution_clock::now(); + is_running = true; + } + + double stop() { + if (!is_running) { + std::cerr << "Timer was not running!" 
<< std::endl; + return 0.0; + } + end_time = std::chrono::high_resolution_clock::now(); + is_running = false; + + auto duration = std::chrono::duration_cast(end_time - start_time); + return duration.count() / 1000.0; // Convert to milliseconds + } +}; + +void print_rank_mesh_info(Mesh_t& mesh, int rank) { + + std::cout<& elems_per_rank, int num_elems, int world_size){ // Compute elements to send to each rank; handle remainders for non-even distribution @@ -39,13 +88,16 @@ void print_mesh_info(Mesh_t& mesh){ std::cout< elements_on_rank; std::vector nodes_on_rank; - std::vector elems_per_rank(world_size); - std::vector nodes_per_rank(world_size); + std::vector elems_per_rank(world_size); // number of elements to send to each rank size(world_size) + std::vector nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) // create a 2D vector of elements to send to each rank std::vector> elements_to_send(world_size); @@ -87,6 +141,16 @@ int main(int argc, char** argv) { // create a 2D vector of nodes to send to each rank std::vector> nodes_to_send(world_size); + // Create a 2D vector to hold the nodal positions on each rank + std::vector> node_pos_to_send(world_size); + + // create a 2D vector to hold the node positions on each rank + std::vector> node_pos_on_rank(world_size); + + +// ******************************************************** +// Build the initial mesh +// ******************************************************** if (rank == 0) { std::cout<<"World size: "< nodes_set; for (int j = 0; j < elems_per_rank[i]; j++) { - for (int k = 0; k < 8; k++) { + for (int k = 0; k < num_nodes_per_elem; k++) { nodes_set.insert(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); } } nodes_to_send[i] = std::vector(nodes_set.begin(), nodes_set.end()); - } + } - for (int i = 0; i < world_size; i++) { - nodes_per_rank[i] = nodes_to_send[i].size(); + if (print_info) { + + for (int i = 0; i < world_size; i++) { + nodes_per_rank[i] = 
nodes_to_send[i].size(); + } + std::cout< node_pos_on_rank_flat(num_nodes_on_rank * 3); + + if(rank == 0) + { + for (int i = 0; i < world_size; i++) { + for(int node_gid = 0; node_gid < nodes_to_send[i].size(); node_gid++) + { + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 0)); + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 1)); + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 2)); + } + } + + // Prepare data for MPI_Scatterv (scatter with variable counts) + // Flatten the 2D node_pos_to_send into a 1D array + std::vector all_node_pos; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size() * 3; + displs[i] = displacement; // displacement is the starting index of the nodes for the current rank in the flattened array + // Copy node positions for rank i to the flattened array + for(int j = 0; j < nodes_to_send[i].size(); j++) { + for(int k = 0; k < 3; k++) { + all_node_pos.push_back(node_pos_to_send[i][j * 3 + k]); + } + } + displacement += nodes_to_send[i].size() * 3; + } + + // Send the node positions to each rank + MPI_Scatterv(all_node_pos.data(), sendcounts.data(), displs.data(), MPI_DOUBLE, + node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_DOUBLE, + node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 0 && print_info) { + // Print out the node positions on this rank + std::cout << "Rank " << rank << " received node positions: "; + for (int i = 0; i < num_nodes_on_rank; i++) { + std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " + << node_pos_on_rank_flat[i*3+1] << ", " + << node_pos_on_rank_flat[i*3+2] << ") "; + } + std::cout << std::endl; } 
- std::cout << std::endl; + MPI_Barrier(MPI_COMM_WORLD); - + if (rank == 1 && print_info) { + // Print out the node positions on this rank + std::cout << "Rank " << rank << " received node positions: "; + for (int i = 0; i < num_nodes_on_rank; i++) { + std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " + << node_pos_on_rank_flat[i*3+1] << ", " + << node_pos_on_rank_flat[i*3+2] << ") "; + } + std::cout << std::endl; + } +// ****************************************************************************************** +// Initialize the node state variables +// ****************************************************************************************** + + // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes_on_rank, 3, required_node_state); + + for(int i = 0; i < num_nodes_on_rank; i++) { + node.coords.host(i, 0) = node_pos_on_rank_flat[i*3]; + node.coords.host(i, 1) = node_pos_on_rank_flat[i*3+1]; + node.coords.host(i, 2) = node_pos_on_rank_flat[i*3+2]; + } + + node.coords.update_device(); + +// ****************************************************************************************** +// Send the element-node connectivity data from the initial mesh to each rank +// ****************************************************************************************** // Send the element-node connectivity data from the initial mesh to each rank - std::vector nodes_in_elem_on_rank; - - // All ranks need to resize their receive buffer - nodes_in_elem_on_rank.resize(num_elements_on_rank * 8); + std::vector nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); if (rank == 0) { // Prepare element-node connectivity data for each rank @@ -276,40 +463,39 @@ int main(int argc, char** argv) { int displacement = 0; for(int i = 0; i < world_size; i++) { - int num_connectivity_entries = elements_to_send[i].size() * 8; // 8 
nodes per element + int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element sendcounts[i] = num_connectivity_entries; displs[i] = displacement; // Copy element-node connectivity for rank i for(int j = 0; j < elements_to_send[i].size(); j++) { - for(int k = 0; k < 8; k++) { + for(int k = 0; k < num_nodes_per_elem; k++) { all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); } } displacement += num_connectivity_entries; } - // Send the connectivity data to each rank MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * 8, MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, 0, MPI_COMM_WORLD); } else { MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * 8, MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, 0, MPI_COMM_WORLD); } MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) { + if (rank == 0 && print_info) { std::cout << "Rank " << rank << " received element-node connectivity (" << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; for (int elem = 0; elem < num_elements_on_rank; elem++) { std::cout << " Element " << elem << " nodes: "; - for (int node = 0; node < 8; node++) { - int idx = elem * 8 + node; + for (int node = 0; node < num_nodes_per_elem; node++) { + int idx = elem * num_nodes_per_elem + node; std::cout << nodes_in_elem_on_rank[idx] << " "; } std::cout << std::endl; @@ -318,38 +504,27 @@ int main(int argc, char** argv) { MPI_Barrier(MPI_COMM_WORLD); - if (rank == 1) { + // if (rank == 1) { - std::cout << "Rank " << rank << " received element-node connectivity (" - << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; - for (int elem = 0; elem < 
num_elements_on_rank; elem++) { - std::cout << " Element " << elem << " nodes: "; - for (int node = 0; node < 8; node++) { - int idx = elem * 8 + node; - std::cout << nodes_in_elem_on_rank[idx] << " "; - } - std::cout << std::endl; - } - } - - mesh.initialize_nodes(num_nodes_on_rank); - - std::vector required_node_state = { node_state::coords }; + // std::cout << "Rank " << rank << " received element-node connectivity (" + // << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; + // for (int elem = 0; elem < num_elements_on_rank; elem++) { + // std::cout << " Element " << elem << " nodes: "; + // for (int node = 0; node < num_nodes_per_elem; node++) { + // int idx = elem * num_nodes_per_elem + node; + // std::cout << nodes_in_elem_on_rank[idx] << " "; + // } + // std::cout << std::endl; + // } + // } +// ****************************************************************************************** +// Initialize the mesh data structures for each rank +// ****************************************************************************************** + mesh.initialize_nodes(num_nodes_on_rank); mesh.initialize_elems(num_elements_on_rank, 3); - - // WARNING WARNING WARNING: THIS IS WRONG< SHOULD BE LOCAL ID. Figure this out - for(int i = 0; i < num_elements_on_rank; i++) { - for(int j = 0; j < 8; j++) { - mesh.nodes_in_elem.host(i, j) = nodes_in_elem_on_rank[i * 8 + j]; - } - } - - mesh.nodes_in_elem.update_device(); - - mesh.local_to_global_node_mapping = DCArrayKokkos(num_nodes_on_rank, "mesh.local_to_global_node_mapping"); mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "mesh.local_to_global_elem_mapping"); @@ -363,27 +538,63 @@ int main(int argc, char** argv) { mesh.local_to_global_node_mapping.update_device(); mesh.local_to_global_elem_mapping.update_device(); - // in kernel, I will do the following - // On each rank, I need: - // 1. Numnber of nodes - // 2. node coordinates - // 3. 
number of elements - // 5. Local node to global node mapping - // 6. Local element to global element mapping - // 7. Element-node connectivity - // With the above, I can call build connectivity on the local mesh + // rebuild the local element-node connectivity using the local node ids + for(int i = 0; i < num_elements_on_rank; i++) { + for(int j = 0; j < num_nodes_per_elem; j++) { + int node_gid = nodes_in_elem_on_rank[i * num_nodes_per_elem + j]; - // elements_on_rank is now received via MPI_Scatterv above + int node_lid = -1; - + // Search through the local to global mapp to find the equivalent local index + for(int k = 0; k < num_nodes_on_rank; k++){ + + if(node_gid == mesh.local_to_global_node_mapping.host(k)) { + node_lid = k; + break; + } + } + + mesh.nodes_in_elem.host(i, j) = node_lid; + } + } + + + mesh.nodes_in_elem.update_device(); +// ****************************************************************************************** +// Build the connectivity for the local mesh +// ****************************************************************************************** - // if (rank == 0) std::cout<<"Finished"< +#include +#include +#include +#include +#include // for string pattern recoginition +#include +#include +#include +#include +#include @@ -35,6 +45,61 @@ inline int get_id(int i, int j, int k, int num_i, int num_j) return i + j * num_i + k * num_i * num_j; } +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn PointIndexFromIJK +/// +/// \brief Given (i,j,k) coordinates within the Lagrange hex, return an +/// offset into the local connectivity (PointIds) array. The order parameter +/// must point to an array of 3 integers specifying the order along each +/// axis of the hexahedron. 
+/// +///////////////////////////////////////////////////////////////////////////// +inline int PointIndexFromIJK(int i, int j, int k, const int* order) +{ + bool ibdy = (i == 0 || i == order[0]); + bool jbdy = (j == 0 || j == order[1]); + bool kbdy = (k == 0 || k == order[2]); + // How many boundaries do we lie on at once? + int nbdy = (ibdy ? 1 : 0) + (jbdy ? 1 : 0) + (kbdy ? 1 : 0); + + if (nbdy == 3) { // Vertex DOF + // ijk is a corner node. Return the proper index (somewhere in [0,7]): + return (i ? (j ? 2 : 1) : (j ? 3 : 0)) + (k ? 4 : 0); + } + + int offset = 8; + if (nbdy == 2) { // Edge DOF + if (!ibdy) { // On i axis + return (i - 1) + (j ? order[0] - 1 + order[1] - 1 : 0) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + if (!jbdy) { // On j axis + return (j - 1) + (i ? order[0] - 1 : 2 * (order[0] - 1) + order[1] - 1) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + // !kbdy, On k axis + offset += 4 * (order[0] - 1) + 4 * (order[1] - 1); + return (k - 1) + (order[2] - 1) * (i ? (j ? 3 : 1) : (j ? 2 : 0)) + offset; + } + + offset += 4 * (order[0] - 1 + order[1] - 1 + order[2] - 1); + if (nbdy == 1) { // Face DOF + if (ibdy) { // On i-normal face + return (j - 1) + ((order[1] - 1) * (k - 1)) + (i ? (order[1] - 1) * (order[2] - 1) : 0) + offset; + } + offset += 2 * (order[1] - 1) * (order[2] - 1); + if (jbdy) { // On j-normal face + return (i - 1) + ((order[0] - 1) * (k - 1)) + (j ? (order[2] - 1) * (order[0] - 1) : 0) + offset; + } + offset += 2 * (order[2] - 1) * (order[0] - 1); + // kbdy, On k-normal face + return (i - 1) + ((order[0] - 1) * (j - 1)) + (k ? 
(order[0] - 1) * (order[1] - 1) : 0) + offset; + } + + // nbdy == 0: Body DOF + offset += 2 * ( (order[1] - 1) * (order[2] - 1) + (order[2] - 1) * (order[0] - 1) + (order[0] - 1) * (order[1] - 1)); + return offset + (i - 1) + (order[0] - 1) * ( (j - 1) + (order[1] - 1) * ( (k - 1))); +} + ///////////////////////////////////////////////////////////////////////////// /// /// \fn build_3d_box @@ -163,4 +228,268 @@ void build_3d_box( mesh.build_connectivity(); } // end build_3d_box + + +///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_vtk + /// + /// \brief Writes a vtk output file + /// + /// \param mesh mesh + /// \param node node data + /// \param rank rank + /// + ///////////////////////////////////////////////////////////////////////////// + void write_vtk(Mesh_t& mesh, + node_t& node, + int rank) + { + + CArray graphics_times(1); + int graphics_id = 0; + graphics_times(0) = 0.0; + + // ---- Update host data ---- + + // material point values + // State.MaterialPoints.den.update_host(); + // State.MaterialPoints.pres.update_host(); + // State.MaterialPoints.stress.update_host(); + // State.MaterialPoints.sspd.update_host(); + // State.MaterialPoints.sie.update_host(); + // State.MaterialPoints.mass.update_host(); + // State.MaterialPoints.conductivity.update_host(); + // State.MaterialPoints.temp_grad.update_host(); + // State.MaterialPoints.eroded.update_host(); + + + // gauss point values + // State.GaussPoints.vol.update_host(); + + // nodal values + node.coords.update_host(); + // State.node.vel.update_host(); + // State.node.mass.update_host(); + // State.node.temp.update_host(); + + Kokkos::fence(); + + + const int num_cell_scalar_vars = 1; + const int num_cell_vec_vars = 0; + const int num_cell_tensor_vars = 0; + + const int num_point_scalar_vars = 1; + const int num_point_vec_vars = 1; + + + // Scalar values associated with a cell + const char cell_scalar_var_names[num_cell_scalar_vars][15] = { + "rank_id" 
+ }; + + // const char cell_vec_var_names[num_cell_vec_vars][15] = { + + // }; + + const char point_scalar_var_names[num_point_scalar_vars][15] = { + "rank_id" + }; + + const char point_vec_var_names[num_point_vec_vars][15] = { + "pos" + }; + + // short hand + const size_t num_nodes = mesh.num_nodes; + const size_t num_elems = mesh.num_elems; + const size_t num_dims = mesh.num_dims; + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_cell_scalar_vars); + int elem_switch = 1; + + + // save the output scalar fields to a single 2D array + + + // export material centric data to the elements + elem_fields(0, 0) = rank; + + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, num_point_vec_vars, 3); + CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = node.coords.host(node_gid, 1); + vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + + point_scalar_fields(node_gid, 0) = rank; + } // end for loop over vertices + + + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + + if (stat("vtk", &st) != 0) { + system("mkdir vtk"); + } + + // snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); + + //sprintf(filename, "vtk/Fierro.%05d.vtk", graphics_id); // mesh file + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d_rank%d.vtk", graphics_id, rank); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "# vtk DataFile Version 2.0\n"); // part 2 + fprintf(out[0], "Mesh for Fierro\n"); // 
part 2 + fprintf(out[0], "ASCII \n"); // part 3 + fprintf(out[0], "DATASET UNSTRUCTURED_GRID\n\n"); // part 4 + + fprintf(out[0], "POINTS %zu float\n", mesh.num_nodes); + + // write all components of the point coordinates + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], + "%f %f %f\n", + node.coords.host(node_gid, 0), + node.coords.host(node_gid, 1), + node.coords.host(node_gid, 2)); + } // end for + + /* + --------------------------------------------------------------------------- + Write the elems + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "CELLS %lu %lu\n", mesh.num_elems, mesh.num_elems + mesh.num_elems * mesh.num_nodes_in_elem); // size=all printed values + + int Pn_order = mesh.Pn; + int order[3] = { Pn_order, Pn_order, Pn_order }; + + // const int num_1D_points = Pn_order+1; + + // write all global point numbers for this elem + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + fprintf(out[0], "%lu ", mesh.num_nodes_in_elem); // num points in this elem + + for (int k = 0; k <= Pn_order; k++) { + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + fprintf(out[0], "%lu ", mesh.nodes_in_elem.host(elem_gid, node_lid)); + } + } + } + + fprintf(out[0], "\n"); + } // end for + + // Write the element types + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_TYPES %zu \n", mesh.num_elems); + // VTK_LAGRANGE_HEXAHEDRON: 72, + // VTK_HIGHER_ORDER_HEXAHEDRON: 67 + // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 + // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html + // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html + // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; 
elem_gid++) { + fprintf(out[0], "%d \n", 72); + } + + /* + --------------------------------------------------------------------------- + Write the nodal vector variables to file + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "POINT_DATA %zu \n", mesh.num_nodes); + + // vtk vector vars = (position, velocity) + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(out[0], "VECTORS %s float \n", point_vec_var_names[var]); + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], "%f %f %f\n", + vec_fields(node_gid, var, 0), + vec_fields(node_gid, var, 1), + vec_fields(node_gid, var, 2)); + } // end for nodes + } // end for vec_vars + + + // vtk scalar vars = (temp) + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", point_scalar_var_names[var]); + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], "%f\n", + point_scalar_fields(node_gid, 0)); + } // end for nodes + } // end for vec_vars + + /* + --------------------------------------------------------------------------- + Write the scalar elem variable to file + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_DATA %zu \n", mesh.num_elems); + + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + fprintf(out[0], "%f\n", rank); + } // end for elem + } // end for cell scalar_vars + + fclose(out[0]); + + // graphics_times(graphics_id) = time_value; + + // Write time series metadata + //sprintf(filename, "vtk/Fierro.vtk.series", graphics_id); // mesh file + str_output_len 
= snprintf(filename, max_len, "vtk/Fierro.vtk.series"); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "{\n"); + fprintf(out[0], " \"file-series-version\" : \"1.0\",\n"); + fprintf(out[0], " \"files\" : [\n"); + + for (int i = 0; i <= graphics_id; i++) { + fprintf(out[0], " { \"name\" : \"Fierro.%05d.vtk\", \"time\" : %12.5e },\n", i, graphics_times(i) ); + } + + // fprintf(out[0], "%12.5e\n", graphics_times(i)); + fprintf(out[0], " ]\n"); // part 4 + fprintf(out[0], "}"); // part 4 + + fclose(out[0]); + + // increment graphics id counter + // graphics_id++; + + + } // end write vtk old + + + #endif \ No newline at end of file From 3f30bcda93284117e5c819ba203416bfadf095a2 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 27 Oct 2025 15:35:38 -0500 Subject: [PATCH 05/52] ENH: PTScotch now partitioning mesh, WIP --- examples/mesh_decomp/CMakeLists.txt | 18 +- examples/mesh_decomp/install_ptscotch.sh | 5 +- examples/mesh_decomp/mesh_decomp.cpp | 316 +++++++++++++++++++++-- 3 files changed, 318 insertions(+), 21 deletions(-) diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt index b002a355..7b7306cd 100644 --- a/examples/mesh_decomp/CMakeLists.txt +++ b/examples/mesh_decomp/CMakeLists.txt @@ -26,7 +26,19 @@ if (KOKKOS) # Add include directories for MPI and Scotch/PT-Scotch target_include_directories(mesh_decomp PRIVATE ${MPI_CXX_INCLUDE_PATH} ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/src/include) - # Link libraries - target_link_libraries(mesh_decomp ${LINKING_LIBRARIES} MPI::MPI_CXX) - target_link_directories(mesh_decomp PRIVATE ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/src/lib) + # Link libraries - order matters! 
libptscotch depends on libscotch + # Use -Wl,--whole-archive to ensure all symbols are included from static libraries + # Note: Only link libptscotcherr.a (not libscotcherr.a) to avoid multiple definitions + target_link_libraries(mesh_decomp ${LINKING_LIBRARIES} MPI::MPI_CXX + -Wl,--whole-archive + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libscotch.a + -Wl,--no-whole-archive + -Wl,--whole-archive + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libptscotcherr.a + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libptscotch.a + -Wl,--no-whole-archive + -lz # zlib for gzip compression + -lbz2 # bzip2 library + -llzma # xz compression library + ) endif() diff --git a/examples/mesh_decomp/install_ptscotch.sh b/examples/mesh_decomp/install_ptscotch.sh index 00d29df9..29d3f853 100755 --- a/examples/mesh_decomp/install_ptscotch.sh +++ b/examples/mesh_decomp/install_ptscotch.sh @@ -27,7 +27,10 @@ cd scotch echo "Building Scotch..." mkdir build cd build -cmake .. +cmake .. -DCMAKE_BUILD_TYPE=Release \ + -DSCOTCH_MPI=ON \ + -DMPI_C_COMPILER=mpicc \ + -DMPI_Fortran_COMPILER=mpifort make echo "Installation complete! 
Libraries installed in: ${INSTALL_PREFIX}" \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 5b6635b7..34a1c683 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -96,7 +96,7 @@ int main(int argc, char** argv) { timer.start(); bool print_info = true; - bool print_vtk = true; + bool print_vtk = false; MPI_Init(&argc, &argv); @@ -154,7 +154,7 @@ int main(int argc, char** argv) { if (rank == 0) { std::cout<<"World size: "< elem_elem_counts(world_size); + int total_elem_elem_entries = 0; + + + if (rank == 0){ + // Calculate total number of connectivity entries for each rank + for(int i = 0; i < world_size; i++) { + elem_elem_counts[i] = 0; + for(int k = 0; k < elements_to_send[i].size(); k++) { + elem_elem_counts[i] += initial_mesh.num_elems_in_elem(elements_to_send[i][k]); + } + + std::cout << "Rank " << i << " will receive " << elem_elem_counts[i] << " element-element connectivity entries" << std::endl; + } + + // Print element-element connectivity entries for each rank in the initial mesh + for(int i = 0; i < world_size; i++) { + std::cout << std::endl; + std::cout << "Rank " << i << " will receive element-element connectivity entries for the following elements: "< elems_in_elem_on_rank(total_elem_elem_entries); + + // Now scatter the num_elems_in_elem for each element on each rank + std::vector num_elems_in_elem_per_rank(num_elements_on_rank); + + if (rank == 0) { + std::vector all_num_elems_in_elem; + std::vector displs_ee(world_size); + int displacement = 0; + + for(int i = 0; i < world_size; i++) { + displs_ee[i] = displacement; + for(int k = 0; k < elements_to_send[i].size(); k++) { + all_num_elems_in_elem.push_back(initial_mesh.num_elems_in_elem(elements_to_send[i][k])); + } + displacement += elements_to_send[i].size(); + } + + MPI_Scatterv(all_num_elems_in_elem.data(), elems_per_rank.data(), displs_ee.data(), MPI_INT, + 
num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + if (rank == 0){ + // Prepare the element-element connectivity data for each rank + std::vector all_elems_in_elem; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + + for(int i = 0; i < world_size; i++) { + sendcounts[i] = elem_elem_counts[i]; + displs[i] = displacement; + + // Copy element-element connectivity for rank i + for(int k = 0; k < elements_to_send[i].size(); k++) { + for(int l = 0; l < initial_mesh.num_elems_in_elem(elements_to_send[i][k]); l++) { + all_elems_in_elem.push_back(initial_mesh.elems_in_elem(elements_to_send[i][k], l)); + } + } + displacement += elem_elem_counts[i]; + } + + // Send the element-element connectivity data to each rank using MPI_Scatterv + MPI_Scatterv(all_elems_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 0 && print_info) { + std::cout << "Rank " << rank << " received element-element connectivity (" + << num_elements_on_rank << " elements, " << elems_in_elem_on_rank.size() << " entries):" << std::endl; + + int offset = 0; + for (int elem = 0; elem < num_elements_on_rank; elem++) { + std::cout << " Element " << elem << " has neighbors: "; + int num_neighbors = num_elems_in_elem_per_rank[elem]; + for (int j = 0; j < num_neighbors; j++) { + std::cout << elems_in_elem_on_rank[offset + j] << " "; + } + offset += num_neighbors; + std::cout << std::endl; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 1 && print_info) { + std::cout 
<< "Rank " << rank << " received element-element connectivity (" + << num_elements_on_rank << " elements, " << elems_in_elem_on_rank.size() << " entries):" << std::endl; + + int offset = 0; + for (int elem = 0; elem < num_elements_on_rank; elem++) { + std::cout << " Element " << elem << " has neighbors: "; + int num_neighbors = num_elems_in_elem_per_rank[elem]; + for (int j = 0; j < num_neighbors; j++) { + std::cout << elems_in_elem_on_rank[offset + j] << " "; + } + offset += num_neighbors; + std::cout << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); // ****************************************************************************************** // Initialize the mesh data structures for each rank @@ -585,7 +711,163 @@ int main(int argc, char** argv) { write_vtk(mesh, node, rank); } - + +// ****************************************************************************************** +// Repartition the mesh using pt-scotch +// ****************************************************************************************** + + + + // --- Simple compact CSR build using global neighbor GIDs (recommended) --- + SCOTCH_Dgraph dgraph; + if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + const SCOTCH_Num baseval = 0; // 0-based + const SCOTCH_Num vertlocnbr = static_cast(mesh.num_elems); + const SCOTCH_Num vertlocmax = vertlocnbr; // no holes + + // Build compact CSR: vertloctab (size vertlocnbr+1) and edgeloctab (neighbors as GLOBAL elem GIDs) + std::vector vertloctab(vertlocnbr + 1); + std::vector edgeloctab; + edgeloctab.reserve(vertlocnbr * 6); // heuristic reserve + + // Build the graph from elems_in_elem_on_rank which contains global neighbor IDs + // First, create a map from element GID to its position in elems_in_elem_on_rank + std::map elem_gid_to_offset; + size_t current_offset = 0; + for (size_t k = 0; k < num_elements_on_rank; k++) { + 
elem_gid_to_offset[elements_on_rank[k]] = current_offset; + current_offset += num_elems_in_elem_per_rank[k]; + } + + SCOTCH_Num offset = 0; + for (size_t lid = 0; lid < mesh.num_elems; ++lid) { + vertloctab[lid] = offset; + + // Get local element's global ID + int elem_gid = mesh.local_to_global_elem_mapping.host(lid); + + // Get the offset in elems_in_elem_on_rank for this element + size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; + + // Get neighbor count - need to find the right index in elements_on_rank + size_t idx = 0; + for (size_t k = 0; k < num_elements_on_rank; k++) { + if (elements_on_rank[k] == elem_gid) { + idx = k; + break; + } + } + size_t num_nbrs = num_elems_in_elem_per_rank[idx]; + + for (size_t j = 0; j < num_nbrs; ++j) { + // Get global neighbor ID from elems_in_elem_on_rank + size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; + edgeloctab.push_back(static_cast(neighbor_gid)); + ++offset; + } + } + vertloctab[vertlocnbr] = offset; + const SCOTCH_Num edgelocnbr = offset; + const SCOTCH_Num edgelocsiz = edgelocnbr; + + // Debug: print graph structure + if (print_info) { + std::cout << "Rank " << rank << ": vertlocnbr=" << vertlocnbr << ", edgelocnbr=" << edgelocnbr << std::endl; + std::cout << "vertloctab: "; + for (size_t i = 0; i <= vertlocnbr; i++) { + std::cout << vertloctab[i] << " "; + } + std::cout << std::endl; + std::cout << "edgeloctab (first 20): "; + for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { + std::cout << edgeloctab[i] << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + // NOTE: Using compact CSR => pass vendloctab = nullptr, vlblloctab = nullptr. + // edgeloctab contains GLOBAL neighbor IDs; SCOTCH will discover remote vertices itself. 
+ int rc = SCOTCH_dgraphBuild(&dgraph, + baseval, + vertlocnbr, + vertlocmax, + vertloctab.data(), // compact offsets + /*vendloctab*/ nullptr, + /*veloloctab*/ nullptr, + /*vlblloctab*/ nullptr, + edgelocnbr, + edgelocsiz, + edgeloctab.data(), + /*edgegsttab*/ nullptr, + /*edloloctab*/ nullptr); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Print graph info after build but before check + if (print_info) { + SCOTCH_Num vertlocnbr_out, vertloctab_size; + SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); + std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr=" << vertlocnbr_out << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + // Sanity check + rc = SCOTCH_dgraphCheck(&dgraph); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphCheck failed rc=" << rc << "\n"; + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Partition the mesh using pt-scotch + // Partition into world_size parts + // Note: Since we already have a distributed mesh, we're asking for a repartition + SCOTCH_Arch archdat; + SCOTCH_archInit(&archdat); + SCOTCH_archCmplt(&archdat, static_cast(world_size)); + + SCOTCH_Strat stratdat; + SCOTCH_stratInit(&stratdat); + + std::vector partloctab(vertlocnbr); + rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + + // Print partition assignment (optional) + for (size_t lid = 0; lid < mesh.num_elems; ++lid) { + size_t gid = mesh.local_to_global_elem_mapping.host(lid); + std::cout << "[rank " << rank << "] elem_local=" << lid << " gid=" << gid + << " -> 
part=" << partloctab[lid] << "\n"; + } + + + + MPI_Barrier(MPI_COMM_WORLD); + + + + + + + + } // end MATAR scope MATAR_FINALIZE(); From 6145513dd7ccd49772a50b02f6f5c10c8fc2c755 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 27 Oct 2025 15:58:17 -0500 Subject: [PATCH 06/52] DOC: Adding documentation and comments for future Jacob, WIP --- examples/mesh_decomp/mesh_decomp.cpp | 228 +++++++++++++++++++-------- 1 file changed, 166 insertions(+), 62 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 34a1c683..1539fbb6 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -112,17 +112,21 @@ int main(int argc, char** argv) { // Initial mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {2, 2, 2}; + int num_elems_dim[3] = {4, 4, 4}; Mesh_t initial_mesh; GaussPoint_t initial_GaussPoints; node_t initial_node; // Create mesh, gauss points, and node data structures on each rank + // This is the initial partitioned mesh Mesh_t mesh; GaussPoint_t GaussPoints; node_t node; + // Mesh partitioned by pt-scotch + Mesh_t final_mesh; + int num_elements_on_rank = 0; int num_nodes_on_rank = 0; @@ -718,42 +722,112 @@ int main(int argc, char** argv) { - // --- Simple compact CSR build using global neighbor GIDs (recommended) --- + /********************************************************************************** + * Build PT-Scotch distributed graph representation of the mesh for repartitioning * + ********************************************************************************** + * + * This section constructs the distributed graph (SCOTCH_Dgraph) needed by PT-Scotch + * for mesh repartitioning. In this graph, each mesh element is a vertex, and edges + * correspond to mesh-neighbor relationships (i.e., elements that share a face or are + * otherwise neighbors per your mesh definition). 
+ * + * We use the compact CSR (Compressed Sparse Row) representation, passing only the + * essential information required by PT-Scotch. + * + * Variables and structures used: + * - SCOTCH_Dgraph dgraph: + * The distributed graph instance managed by PT-Scotch. Each MPI rank creates + * and fills in its portion of the global graph. + * + * - const SCOTCH_Num baseval: + * The base value for vertex and edge numbering. Set to 0 for C-style zero-based + * arrays. Always use 0 unless you are using Fortran style 1-based arrays. + * + * - const SCOTCH_Num vertlocnbr: + * The *number of local vertices* (mesh elements) defined on this MPI rank. + * In our mesh, this is mesh.num_elems. PT-Scotch expects each rank to specify + * its own local vertex count. + * + * - const SCOTCH_Num vertlocmax: + * The *maximum number of local vertices* that could be stored (capacity). We + * allocate with no unused holes, so vertlocmax = vertlocnbr. + * + * - std::vector vertloctab: + * CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i] + * gives the index in edgeloctab where the neighbor list of vertex i begins. + * PT-Scotch expects this array to be of size vertlocnbr+1, where the difference + * vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i. + * + * - std::vector edgeloctab: + * CSR array [variable size]: a flattened list of *neighboring element global IDs*, + * in no particular order. For vertex i, its neighbors are located at + * edgeloctab[vertloctab[i]...vertloctab[i+1]-1]. + * In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to + * recognize edges both within and across ranks. + * + * - std::map elem_gid_to_offset: + * Helper map: For a given element global ID, gives the starting offset in + * the flattened neighbor array (elems_in_elem_on_rank) where this element's + * list of neighbor GIDs begins. This allows efficient neighbor list lookup. 
+ * + * - (other arrays used, from mesh setup and communication phase) + * - elements_on_rank: vector of global element IDs owned by this rank. + * - num_elements_on_rank: number of owned elements. + * - num_elems_in_elem_per_rank: array, for each owned element, how many + * neighbors it has. + * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. + * + **********************************************************************************/ + + // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- SCOTCH_Dgraph dgraph; if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; MPI_Abort(MPI_COMM_WORLD, 1); } - const SCOTCH_Num baseval = 0; // 0-based + // Set base value for numbering (0 for C-style arrays) + const SCOTCH_Num baseval = 0; + + // vertlocnbr: Number of elements (vertices) that are local to this MPI rank const SCOTCH_Num vertlocnbr = static_cast(mesh.num_elems); - const SCOTCH_Num vertlocmax = vertlocnbr; // no holes - // Build compact CSR: vertloctab (size vertlocnbr+1) and edgeloctab (neighbors as GLOBAL elem GIDs) + // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) + const SCOTCH_Num vertlocmax = vertlocnbr; + + // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- + // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins std::vector vertloctab(vertlocnbr + 1); + + // edgeloctab: flat array of neighbor global IDs for all local elements, built in order std::vector edgeloctab; - edgeloctab.reserve(vertlocnbr * 6); // heuristic reserve + edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance - // Build the graph from elems_in_elem_on_rank which contains global neighbor IDs - // First, create a map from element GID to its position in elems_in_elem_on_rank + // Construct 
a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) + // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. std::map elem_gid_to_offset; size_t current_offset = 0; for (size_t k = 0; k < num_elements_on_rank; k++) { elem_gid_to_offset[elements_on_rank[k]] = current_offset; current_offset += num_elems_in_elem_per_rank[k]; } - - SCOTCH_Num offset = 0; + + // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- + SCOTCH_Num offset = 0; // running count of edges encountered + for (size_t lid = 0; lid < mesh.num_elems; ++lid) { + + // Record current edge offset for vertex lid in vertloctab vertloctab[lid] = offset; - // Get local element's global ID + // Obtain this local element's global ID (from mapping) int elem_gid = mesh.local_to_global_elem_mapping.host(lid); - - // Get the offset in elems_in_elem_on_rank for this element + + // Find offset in the flattened neighbor array for this element's neighbor list size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; - - // Get neighbor count - need to find the right index in elements_on_rank + + // For this element, find the count of its neighbors + // This requires finding its index in the elements_on_rank array size_t idx = 0; for (size_t k = 0; k < num_elements_on_rank; k++) { if (elements_on_rank[k] == elem_gid) { @@ -762,27 +836,33 @@ int main(int argc, char** argv) { } } size_t num_nbrs = num_elems_in_elem_per_rank[idx]; - + + // Append each neighbor (by its GLOBAL elem GID) to edgeloctab for (size_t j = 0; j < num_nbrs; ++j) { - // Get global neighbor ID from elems_in_elem_on_rank - size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; + size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! 
edgeloctab.push_back(static_cast(neighbor_gid)); - ++offset; + ++offset; // Increment running edge count } } + + // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure vertloctab[vertlocnbr] = offset; - const SCOTCH_Num edgelocnbr = offset; - const SCOTCH_Num edgelocsiz = edgelocnbr; - // Debug: print graph structure + // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally + // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) + const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) + const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size matches number of endpoints + + // Optionally print graph structure for debugging/validation if (print_info) { - std::cout << "Rank " << rank << ": vertlocnbr=" << vertlocnbr << ", edgelocnbr=" << edgelocnbr << std::endl; - std::cout << "vertloctab: "; + std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr + << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; + std::cout << "vertloctab (CSR row offsets): "; for (size_t i = 0; i <= vertlocnbr; i++) { std::cout << vertloctab[i] << " "; } std::cout << std::endl; - std::cout << "edgeloctab (first 20): "; + std::cout << "edgeloctab (first 20 neighbor GIDs): "; for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { std::cout << edgeloctab[i] << " "; } @@ -790,36 +870,48 @@ int main(int argc, char** argv) { } MPI_Barrier(MPI_COMM_WORLD); - // NOTE: Using compact CSR => pass vendloctab = nullptr, vlblloctab = nullptr. - // edgeloctab contains GLOBAL neighbor IDs; SCOTCH will discover remote vertices itself. 
- int rc = SCOTCH_dgraphBuild(&dgraph, - baseval, - vertlocnbr, - vertlocmax, - vertloctab.data(), // compact offsets - /*vendloctab*/ nullptr, - /*veloloctab*/ nullptr, - /*vlblloctab*/ nullptr, - edgelocnbr, - edgelocsiz, - edgeloctab.data(), - /*edgegsttab*/ nullptr, - /*edloloctab*/ nullptr); + /************************************************************************** + * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild + * + * - PT-Scotch will use our CSR arrays. Since we use compact representation, + * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") + * can be passed as nullptr. + * - edgeloctab contains *GLOBAL element GIDs* of neighbors. PT-Scotch uses this + * to discover connections across processor boundaries, so you do not have to + * encode ownership or partition information yourself. + **************************************************************************/ + int rc = SCOTCH_dgraphBuild( + &dgraph, + baseval, // start index (0) + vertlocnbr, // local vertex count (local elements) + vertlocmax, // local vertex max (no holes) + vertloctab.data(), // row offsets in edgeloctab + /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) + /*veloloctab*/ nullptr, // vertex weights, not used + /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) + edgelocnbr, // local edge endpoints count + edgelocsiz, // size of edge array + edgeloctab.data(), // global neighbor IDs for each local node + /*edgegsttab*/ nullptr, // ghost edge array, not used + /*edloloctab*/ nullptr // edge weights, not used + ); if (rc != 0) { std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; SCOTCH_dgraphFree(&dgraph); MPI_Abort(MPI_COMM_WORLD, rc); } - // Print graph info after build but before check + // Optionally, print rank summary after graph build for further validation if (print_info) { - SCOTCH_Num vertlocnbr_out, vertloctab_size; + SCOTCH_Num 
vertlocnbr_out; SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); - std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr=" << vertlocnbr_out << std::endl; + std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; } MPI_Barrier(MPI_COMM_WORLD); - // Sanity check + /******************************************************** + * Step 5: Validate the graph using SCOTCH_dgraphCheck + ********************************************************/ rc = SCOTCH_dgraphCheck(&dgraph); if (rc != 0) { std::cerr << "[rank " << rank << "] SCOTCH_dgraphCheck failed rc=" << rc << "\n"; @@ -827,16 +919,21 @@ int main(int argc, char** argv) { MPI_Abort(MPI_COMM_WORLD, rc); } - // Partition the mesh using pt-scotch - // Partition into world_size parts - // Note: Since we already have a distributed mesh, we're asking for a repartition - SCOTCH_Arch archdat; + /************************************************************** + * Step 6: Partition (repartition) the mesh using PT-Scotch + * - Each vertex (mesh element) will be assigned a part (mesh chunk). + * - Arch is initialized for a complete graph of world_size parts (one per rank). + * - Loki + **************************************************************/ + SCOTCH_Arch archdat; // PT-Scotch architecture structure: describes desired partition topology SCOTCH_archInit(&archdat); - SCOTCH_archCmplt(&archdat, static_cast(world_size)); - - SCOTCH_Strat stratdat; + SCOTCH_archCmplt(&archdat, static_cast(world_size)); // Partition into world_size complete nodes + + SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings SCOTCH_stratInit(&stratdat); - + + // partloctab: output array mapping each local element (vertex) to a *target partition number* + // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. 
std::vector partloctab(vertlocnbr); rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); if (rc != 0) { @@ -846,20 +943,27 @@ int main(int argc, char** argv) { SCOTCH_dgraphFree(&dgraph); MPI_Abort(MPI_COMM_WORLD, rc); } - + + // Clean up PT-Scotch strategy and architecture objects SCOTCH_stratExit(&stratdat); SCOTCH_archExit(&archdat); - // Print partition assignment (optional) - for (size_t lid = 0; lid < mesh.num_elems; ++lid) { - size_t gid = mesh.local_to_global_elem_mapping.host(lid); - std::cout << "[rank " << rank << "] elem_local=" << lid << " gid=" << gid - << " -> part=" << partloctab[lid] << "\n"; + /*************************************************************************** + * Step 7 (Optional): Print out the partitioning assignment per element + * - Each local element's local index lid and global ID (gid) are listed with the + * part to which PT-Scotch has assigned them. + ***************************************************************************/ + for(int rank_id = 0; rank_id < world_size; rank_id++) { + if(rank_id == rank) { + for (size_t lid = 0; lid < mesh.num_elems; ++lid) { + size_t gid = mesh.local_to_global_elem_mapping.host(lid); + std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid + << " -> part=" << partloctab[lid] << "\n"; + } + MPI_Barrier(MPI_COMM_WORLD); + } + MPI_Barrier(MPI_COMM_WORLD); } - - - - MPI_Barrier(MPI_COMM_WORLD); From 91b3b8dace7e8e5c7cdbf80e053784d10c6c2acb Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 27 Oct 2025 16:51:43 -0500 Subject: [PATCH 07/52] ENH: Debugging repartition, nodal coordinates seem off --- examples/mesh_decomp/mesh_decomp.cpp | 237 ++++++++++++++++++++++++++- 1 file changed, 234 insertions(+), 3 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 1539fbb6..3cd4c709 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -51,7 +51,7 @@ void 
print_rank_mesh_info(Mesh_t& mesh, int rank) { std::cout<<"Mesh has "<> elems_to_send(world_size); + for (int lid = 0; lid < mesh.num_elems; ++lid) { + int dest = static_cast(partloctab[lid]); + int elem_gid = static_cast(mesh.local_to_global_elem_mapping.host(lid)); + elems_to_send[dest].push_back(elem_gid); + } + + // -------------- Phase 2: Exchange element GIDs -------------- + std::vector sendcounts(world_size), recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + sendcounts[r] = static_cast(elems_to_send[r].size()); + + MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements + std::vector sdispls(world_size), rdispls(world_size); + int send_total = 0, recv_total = 0; + for (int r = 0; r < world_size; ++r) { + sdispls[r] = send_total; + rdispls[r] = recv_total; + send_total += sendcounts[r]; + recv_total += recvcounts[r]; + } + + + // Flatten send buffer + std::vector sendbuf; + sendbuf.reserve(send_total); + for (int r = 0; r < world_size; ++r) + sendbuf.insert(sendbuf.end(), elems_to_send[r].begin(), elems_to_send[r].end()); + + // Receive new local element GIDs + std::vector recvbuf(recv_total); + MPI_Alltoallv(sendbuf.data(), sendcounts.data(), sdispls.data(), MPI_INT, + recvbuf.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // New elements owned by this rank + std::vector new_elem_gids = recvbuf; + int num_new_elems = static_cast(new_elem_gids.size()); + + + if (print_info) { + std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; + } + + // -------------- Phase 3: Send element–node connectivity -------------- + int nodes_per_elem = mesh.num_nodes_in_elem; + + // Flatten element-node connectivity by global node IDs + std::vector conn_sendbuf; + for (int r = 0; r < world_size; ++r) { + for (int gid : elems_to_send[r]) { + // find local element lid from gid + int 
lid = -1; + for (int i = 0; i < mesh.num_elems; ++i) + if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; ++j) { + int node_lid = mesh.nodes_in_elem.host(lid, j); + int node_gid = mesh.local_to_global_node_mapping.host(node_lid); + conn_sendbuf.push_back(node_gid); + } + } + } + + // element-node connectivity counts (ints per dest rank) + std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; + + MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + std::vector conn_sdispls(world_size), conn_rdispls(world_size); + int conn_send_total = 0, conn_recv_total = 0; + for (int r = 0; r < world_size; ++r) { + conn_sdispls[r] = conn_send_total; + conn_rdispls[r] = conn_recv_total; + conn_send_total += conn_sendcounts[r]; + conn_recv_total += conn_recvcounts[r]; + } + + std::vector conn_recvbuf(conn_recv_total); + MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, + conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + + // -------------- Phase 4: Build new node list (unique GIDs) -------------- + std::set node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); + std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); + int num_new_nodes = static_cast(new_node_gids.size()); + + // Build map gid→lid + std::unordered_map node_gid_to_lid; + for (int i = 0; i < num_new_nodes; ++i) + node_gid_to_lid[new_node_gids[i]] = i; + + if (print_info) + std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; + // -------------- Phase 5: Request node coordinates -------------- + std::vector node_coords_sendbuf; + for (int r = 0; r < world_size; ++r) { + for (int gid : elems_to_send[r]) { + 
int lid = -1; + for (int i = 0; i < mesh.num_elems; ++i) + if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + for (int j = 0; j < nodes_per_elem; ++j) { + int node_lid = mesh.nodes_in_elem.host(lid, j); + int node_gid = mesh.local_to_global_node_mapping.host(node_lid); + + node_coords_sendbuf.push_back(node.coords.host(node_lid, 0)); + node_coords_sendbuf.push_back(node.coords.host(node_lid, 1)); + node_coords_sendbuf.push_back(node.coords.host(node_lid, 2)); + } + } + } + + // Each node is 3 doubles; same sendcounts scaling applies + std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; + + MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + std::vector coord_sdispls(world_size), coord_rdispls(world_size); + int coord_send_total = 0, coord_recv_total = 0; + for (int r = 0; r < world_size; ++r) { + coord_sdispls[r] = coord_send_total; + coord_rdispls[r] = coord_recv_total; + coord_send_total += coord_sendcounts[r]; + coord_recv_total += coord_recvcounts[r]; + } + + std::vector coord_recvbuf(coord_recv_total); + MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, + coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // -------------- Phase 6: Build the final_mesh -------------- + final_mesh.initialize_nodes(num_new_nodes); + final_mesh.initialize_elems(num_new_elems, mesh.num_dims); + final_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); + final_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + + // Fill global mappings + for (int i = 0; i < num_new_nodes; ++i) + final_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; + for (int i = 0; i < num_new_elems; ++i) + 
final_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; + + final_mesh.local_to_global_node_mapping.update_device(); + final_mesh.local_to_global_elem_mapping.update_device(); + + // // Rebuild nodes_in_elem + // for (int e = 0; e < num_new_elems; ++e) { + // for (int j = 0; j < nodes_per_elem; ++j) { + // int node_gid = conn_recvbuf[e * nodes_per_elem + j]; + // int node_lid = node_gid_to_lid[node_gid]; + // final_mesh.nodes_in_elem.host(e, j) = node_lid; + // } + // } + // final_mesh.nodes_in_elem.update_device(); + + + // rebuild the local element-node connectivity using the local node ids + for(int i = 0; i < num_new_elems; i++) { + for(int j = 0; j < nodes_per_elem; j++) { + + int node_gid = conn_recvbuf[i * nodes_per_elem + j]; + + int node_lid = -1; + + // Search through the local to global mapp to find the equivalent local index + for(int k = 0; k < num_new_nodes; k++){ + + if(node_gid == final_mesh.local_to_global_node_mapping.host(k)) { + node_lid = k; + break; + } + } + + final_mesh.nodes_in_elem.host(i, j) = node_lid; + } + } + + final_mesh.nodes_in_elem.update_device(); + + // Fill node coordinates + final_node.initialize(num_new_nodes, 3, {node_state::coords}); + for (int i = 0; i < num_new_nodes; ++i) { + final_node.coords.host(i, 0) = coord_recvbuf[i*3 + 0]; + final_node.coords.host(i, 1) = coord_recvbuf[i*3 + 1]; + final_node.coords.host(i, 2) = coord_recvbuf[i*3 + 2]; + } + final_node.coords.update_device(); + + // Connectivity rebuild + final_mesh.build_connectivity(); + MPI_Barrier(MPI_COMM_WORLD); + + for(int i = 0; i < world_size; i++) { + if(rank == i) { + print_rank_mesh_info(final_mesh, i); + } + MPI_Barrier(MPI_COMM_WORLD); + } + MPI_Barrier(MPI_COMM_WORLD); + write_vtk(final_mesh, final_node, rank); } // end MATAR scope MATAR_FINALIZE(); From f40c187695e166ab31b2fcea84a6c4c0d4683126 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 27 Oct 2025 17:03:33 -0500 Subject: [PATCH 08/52] ENH: It works --- 
examples/mesh_decomp/mesh_decomp.cpp | 31 ++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 3cd4c709..feccca61 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -112,7 +112,7 @@ int main(int argc, char** argv) { // Initial mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {2, 2, 2}; + int num_elems_dim[3] = {20, 20, 20}; Mesh_t initial_mesh; GaussPoint_t initial_GaussPoints; @@ -1181,11 +1181,34 @@ int main(int argc, char** argv) { final_mesh.nodes_in_elem.update_device(); // Fill node coordinates + // coord_recvbuf contains coords in element-node order, but we need them in node order + // Build a map from node GID to coordinates + std::map> node_gid_to_coords; + int coord_idx = 0; + for (int e = 0; e < num_new_elems; ++e) { + for (int j = 0; j < nodes_per_elem; ++j) { + int node_gid = conn_recvbuf[e * nodes_per_elem + j]; + if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { + node_gid_to_coords[node_gid] = { + coord_recvbuf[coord_idx*3 + 0], + coord_recvbuf[coord_idx*3 + 1], + coord_recvbuf[coord_idx*3 + 2] + }; + } + coord_idx++; + } + } + + // Now fill coordinates in node order final_node.initialize(num_new_nodes, 3, {node_state::coords}); for (int i = 0; i < num_new_nodes; ++i) { - final_node.coords.host(i, 0) = coord_recvbuf[i*3 + 0]; - final_node.coords.host(i, 1) = coord_recvbuf[i*3 + 1]; - final_node.coords.host(i, 2) = coord_recvbuf[i*3 + 2]; + int node_gid = new_node_gids[i]; + auto it = node_gid_to_coords.find(node_gid); + if (it != node_gid_to_coords.end()) { + final_node.coords.host(i, 0) = it->second[0]; + final_node.coords.host(i, 1) = it->second[1]; + final_node.coords.host(i, 2) = it->second[2]; + } } final_node.coords.update_device(); From 35d0348d9619c0b83a18e1b762c327e6beea65e6 Mon Sep 17 00:00:00 2001 
From: Jacob Moore Date: Wed, 29 Oct 2025 09:54:47 -0500 Subject: [PATCH 09/52] ENH: Swapping to binary search and adding timers --- examples/mesh_decomp/mesh_decomp.cpp | 213 +++++++++++++++++++++------ 1 file changed, 166 insertions(+), 47 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index feccca61..d6144eaf 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -95,7 +95,7 @@ int main(int argc, char** argv) { Timer timer; timer.start(); - bool print_info = true; + bool print_info = false; bool print_vtk = false; @@ -112,7 +112,7 @@ int main(int argc, char** argv) { // Initial mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {20, 20, 20}; + int num_elems_dim[3] = {100, 100, 100}; Mesh_t initial_mesh; GaussPoint_t initial_GaussPoints; @@ -156,6 +156,8 @@ int main(int argc, char** argv) { // ******************************************************** // Build the initial mesh // ******************************************************** + double t_init_mesh_start = MPI_Wtime(); + if (rank == 0) { std::cout<<"World size: "<(nodes_set.begin(), nodes_set.end()); } + for (int i = 0; i < world_size; i++) { + nodes_per_rank[i] = nodes_to_send[i].size(); + } + if (print_info) { - for (int i = 0; i < world_size; i++) { - nodes_per_rank[i] = nodes_to_send[i].size(); - } + std::cout< node_pos_on_rank_flat(num_nodes_on_rank * 3); + // Timer for scattering node positions + double t_scatter_nodepos_start = MPI_Wtime(); + if(rank == 0) { for (int i = 0; i < world_size; i++) { @@ -437,6 +489,15 @@ int main(int argc, char** argv) { std::cout << std::endl; } + MPI_Barrier(MPI_COMM_WORLD); + + double t_scatter_nodepos_end = MPI_Wtime(); + if(rank == 0) { + std::cout<<" Finished scattering the node positions to each rank"< nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); + double t_scatter_elemnode_start = MPI_Wtime(); 
+ if (rank == 0) { // Prepare element-node connectivity data for each rank std::vector all_nodes_in_elem; @@ -493,6 +556,13 @@ int main(int argc, char** argv) { MPI_Barrier(MPI_COMM_WORLD); + double t_scatter_elemnode_end = MPI_Wtime(); + if(rank == 0) { + std::cout << " Finished scattering the element-node connectivity data from the initial mesh to each rank" << std::endl; + std::cout << " Scattering element-node connectivity took " + << (t_scatter_elemnode_end - t_scatter_elemnode_start) << " seconds." << std::endl; + } + if (rank == 0 && print_info) { std::cout << "Rank " << rank << " received element-node connectivity (" @@ -508,6 +578,7 @@ int main(int argc, char** argv) { } MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished scattering the element-node connectivity data from the initial mesh to each rank"< elems_in_elem_on_rank(total_elem_elem_entries); // Now scatter the num_elems_in_elem for each element on each rank @@ -580,6 +663,9 @@ int main(int argc, char** argv) { 0, MPI_COMM_WORLD); } + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished scattering the actual element-element connectivity counts per element to each rank"< all_elems_in_elem; @@ -614,6 +700,9 @@ int main(int argc, char** argv) { MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished receiving the actual element-element connectivity entries to each rank"< new_elem_gids = recvbuf; @@ -1057,7 +1170,10 @@ int main(int argc, char** argv) { conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< conn_sdispls(world_size), conn_rdispls(world_size); int conn_send_total = 0, conn_recv_total = 0; @@ -1073,7 +1189,7 @@ int main(int argc, char** argv) { conn_recvbuf.data(), conn_recvcounts.data(), 
conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); @@ -1115,6 +1231,7 @@ int main(int argc, char** argv) { MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates counts"< coord_sdispls(world_size), coord_rdispls(world_size); int coord_send_total = 0, coord_recv_total = 0; @@ -1130,6 +1247,7 @@ int main(int argc, char** argv) { coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates"< Date: Wed, 29 Oct 2025 12:07:45 -0500 Subject: [PATCH 10/52] ENH: Adding ghost elements WIP --- examples/mesh_decomp/mesh.h | 7 +- examples/mesh_decomp/mesh_decomp.cpp | 213 ++++++++++++++++++++++++++- examples/mesh_decomp/mesh_io.h | 33 +++-- 3 files changed, 236 insertions(+), 17 deletions(-) diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index 0011d2e8..92f3bcdf 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -308,16 +308,19 @@ struct Mesh_t // MPI Decomposition Data Definitions ---- // DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping - DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping + + // Data structure for ghost elements required for MPI comms + size_t num_ghost_elems; ///< Number of ghost elements on this rank (from neighboring MPI ranks) + + // initialization methods void initialize_nodes(const size_t num_nodes_inp) { num_nodes = num_nodes_inp; - return; }; // end method diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index d6144eaf..564dd3f1 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp 
+++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "mesh.h" @@ -112,7 +113,7 @@ int main(int argc, char** argv) { // Initial mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {100, 100, 100}; + int num_elems_dim[3] = {2, 2, 2}; Mesh_t initial_mesh; GaussPoint_t initial_GaussPoints; @@ -1069,6 +1070,7 @@ int main(int argc, char** argv) { * - Each local element's local index lid and global ID (gid) are listed with the * part to which PT-Scotch has assigned them. ***************************************************************************/ + print_info = true; for(int rank_id = 0; rank_id < world_size; rank_id++) { if(rank_id == rank && print_info) { for (size_t lid = 0; lid < mesh.num_elems; ++lid) { @@ -1080,6 +1082,7 @@ int main(int argc, char** argv) { } MPI_Barrier(MPI_COMM_WORLD); } + print_info = false; @@ -1335,6 +1338,214 @@ int main(int argc, char** argv) { final_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); + + +// ****************************************************************************************** +// Build the ghost elements +// ****************************************************************************************** + + double t_ghost_start = MPI_Wtime(); + + // Update host arrays for ghost detection + final_mesh.local_to_global_elem_mapping.update_host(); + final_mesh.local_to_global_node_mapping.update_host(); + final_mesh.nodes_in_elem.update_host(); + Kokkos::fence(); + + // Build a set of locally-owned element global IDs for fast lookup + std::set local_elem_gids; + for (int i = 0; i < num_new_elems; ++i) { + local_elem_gids.insert(final_mesh.local_to_global_elem_mapping.host(i)); + } + + // Exchange element GIDs with all ranks to know who owns what + // Collect all locally-owned element global IDs to send to other ranks + std::vector local_elem_gids_vec(local_elem_gids.begin(), local_elem_gids.end()); + + // First, gather 
the number of elements each rank owns + std::vector elem_counts(world_size); + int local_elem_count = static_cast(local_elem_gids_vec.size()); + + MPI_Allgather(&local_elem_count, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Compute displacements + std::vector elem_displs(world_size); + int total_elems = 0; + for (int r = 0; r < world_size; ++r) { + elem_displs[r] = total_elems; + total_elems += elem_counts[r]; + } + + // Gather all element GIDs from all ranks + std::vector all_elem_gids(total_elems); + MPI_Allgatherv(local_elem_gids_vec.data(), local_elem_count, MPI_UNSIGNED_LONG_LONG, + all_elem_gids.data(), elem_counts.data(), elem_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // Build a map: element GID -> owning rank + std::map elem_gid_to_rank; + for (int r = 0; r < world_size; ++r) { + for (int i = 0; i < elem_counts[r]; ++i) { + size_t gid = all_elem_gids[elem_displs[r] + i]; + elem_gid_to_rank[gid] = r; + } + } + + // Strategy: Find ghost elements by checking neighbors of our boundary elements. + // A boundary element is one that has a neighbor owned by another rank. + // However, since build_connectivity() only includes locally-owned elements, + // we need to use a different approach: find elements on other ranks that share + // nodes with our locally-owned elements. 
+ + // First, collect all nodes that belong to our locally-owned elements + std::set local_elem_nodes; + for (int lid = 0; lid < num_new_elems; ++lid) { + for (int j = 0; j < nodes_per_elem; ++j) { + size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + local_elem_nodes.insert(node_gid); + } + } + + // Now collect element-to-node connectivity to send to all ranks + // Format: for each element, list its node GIDs (each entry is a pair: elem_gid, node_gid) + std::vector elem_node_conn; + int local_conn_size = 0; + + for (int lid = 0; lid < num_new_elems; ++lid) { + size_t elem_gid = final_mesh.local_to_global_elem_mapping.host(lid); + for (int j = 0; j < nodes_per_elem; ++j) { + size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + elem_node_conn.push_back(elem_gid); + elem_node_conn.push_back(node_gid); + } + local_conn_size += nodes_per_elem * 2; // Each pair is 2 size_ts + } + + // Exchange element-node connectivity with all ranks using Allgather + // First, gather the sizes from each rank + std::vector conn_sizes(world_size); + MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Compute displacements + std::vector conn_displs(world_size); + int total_conn = 0; + for (int r = 0; r < world_size; ++r) { + conn_displs[r] = total_conn; + total_conn += conn_sizes[r]; + } + + // Gather all element-node pairs from all ranks + std::vector all_conn(total_conn); + MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, + all_conn.data(), conn_sizes.data(), conn_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // Build a map: node GID -> set of element GIDs that contain it (from other ranks) + std::map> node_to_ext_elem; + for (int r = 0; r < world_size; ++r) { + if (r == rank) continue; // Skip our own data + // Process pairs from 
rank r: conn_sizes[r] is in units of size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + for (int i = 0; i < num_pairs; ++i) { + // Each pair is 2 size_ts, starting at conn_displs[r] + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this node is in one of our elements, then the element is a potential ghost + if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { + // Check if this element is not owned by us + if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { + node_to_ext_elem[node_gid].insert(elem_gid); + } + } + } + } + + // Collect all unique ghost element GIDs + std::set ghost_elem_gids; + for (const auto& pair : node_to_ext_elem) { + for (size_t elem_gid : pair.second) { + ghost_elem_gids.insert(elem_gid); + } + } + + // Additional check: elements that are neighbors of our locally-owned elements + // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) + + for (int lid = 0; lid < num_new_elems; ++lid) { + size_t num_neighbors = final_mesh.num_elems_in_elem(lid); + + for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { + size_t neighbor_lid = final_mesh.elems_in_elem(lid, nbr_idx); + + if (neighbor_lid < static_cast(num_new_elems)) { + size_t neighbor_gid = final_mesh.local_to_global_elem_mapping(neighbor_lid); + + // Check if neighbor is owned by this rank + auto it = elem_gid_to_rank.find(neighbor_gid); + if (it != elem_gid_to_rank.end() && it->second != rank) { + // Neighbor is owned by another rank - it's a ghost for us + ghost_elem_gids.insert(neighbor_gid); + } + } + } + } + + // Count unique ghost elements + final_mesh.num_ghost_elems = ghost_elem_gids.size(); + + MPI_Barrier(MPI_COMM_WORLD); + double t_ghost_end = MPI_Wtime(); + + if (rank == 0) { + std::cout << " Finished calculating ghost elements" << std::endl; + std::cout << " Ghost element calculation took " << 
(t_ghost_end - t_ghost_start) << " seconds." << std::endl; + } + + // Print ghost element info if requested + print_info = true; + for(int i = 0; i < world_size; i++) { + if(rank == i && print_info) { + std::cout << "[rank " << rank << "] owns " << num_new_elems + << " elements and has " << final_mesh.num_ghost_elems << " ghost elements" << std::endl; + std::cout << "[rank " << rank << "] owned element global IDs: "; + for (int j = 0; j < num_new_elems; ++j) { + std::cout << final_mesh.local_to_global_elem_mapping(j) << " "; + } + std::cout << std::endl; + + + + std::cout << "[rank " << rank << "] ghost element GIDs: "; + for (size_t gid : ghost_elem_gids) { + std::cout << gid << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + + + MPI_Barrier(MPI_COMM_WORLD); + + + + + + + + + + + + + + for(int i = 0; i < world_size; i++) { if(rank == i && print_info) { print_rank_mesh_info(final_mesh, i); diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 95db8132..b044b599 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -231,16 +231,16 @@ void build_3d_box( ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_vtk - /// - /// \brief Writes a vtk output file - /// - /// \param mesh mesh - /// \param node node data - /// \param rank rank - /// - ///////////////////////////////////////////////////////////////////////////// +/// +/// \fn write_vtk +/// +/// \brief Writes a vtk output file +/// +/// \param mesh mesh +/// \param node node data +/// \param rank rank +/// +///////////////////////////////////////////////////////////////////////////// void write_vtk(Mesh_t& mesh, node_t& node, int rank) @@ -276,7 +276,7 @@ void build_3d_box( Kokkos::fence(); - const int num_cell_scalar_vars = 1; + const int num_cell_scalar_vars = 2; const int num_cell_vec_vars = 0; const int num_cell_tensor_vars = 0; @@ -285,8 +285,8 @@ void build_3d_box( // 
Scalar values associated with a cell - const char cell_scalar_var_names[num_cell_scalar_vars][15] = { - "rank_id" + const char cell_scalar_var_names[num_cell_scalar_vars][30] = { + "rank_id", "elems_in_elem_owned" }; // const char cell_vec_var_names[num_cell_vec_vars][15] = { @@ -317,6 +317,11 @@ void build_3d_box( // export material centeric data to the elements elem_fields(0, 0) = rank; + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + elem_fields(elem_gid, 0) = rank; + elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); + } + // save the vertex vector fields to an array for exporting to graphics files CArray vec_fields(num_nodes, num_point_vec_vars, 3); @@ -454,7 +459,7 @@ void build_3d_box( fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] fprintf(out[0], "LOOKUP_TABLE default\n"); for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { - fprintf(out[0], "%f\n", rank); + fprintf(out[0], "%f\n", elem_fields(elem_gid, var)); } // end for elem } // end for cell scalar_vars From 659d72da6f6f89bcf361ba60c850c7b9cea657c4 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 31 Oct 2025 10:53:44 -0500 Subject: [PATCH 11/52] ENH: Adding ghost elements and nodes --- examples/mesh_decomp/mesh_decomp.cpp | 669 +++++++++++++++++++++++---- examples/mesh_decomp/mesh_io.h | 6 +- 2 files changed, 574 insertions(+), 101 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 564dd3f1..85f2b7f2 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -16,86 +16,54 @@ #include "scotch.h" #include "ptscotch.h" -// Timer class for timing the execution of the matrix multiplication -class Timer { - private: - std::chrono::high_resolution_clock::time_point start_time; - std::chrono::high_resolution_clock::time_point end_time; - bool is_running; - - public: - Timer() : 
is_running(false) {} - - void start() { - start_time = std::chrono::high_resolution_clock::now(); - is_running = true; - } - - double stop() { - if (!is_running) { - std::cerr << "Timer was not running!" << std::endl; - return 0.0; - } - end_time = std::chrono::high_resolution_clock::now(); - is_running = false; - - auto duration = std::chrono::duration_cast(end_time - start_time); - return duration.count() / 1000.0; // Convert to milliseconds - } -}; -void print_rank_mesh_info(Mesh_t& mesh, int rank) { +void calc_elements_per_rank(std::vector& elems_per_rank, int num_elems, int world_size){ + // Compute elements to send to each rank; handle remainders for non-even distribution + std::fill(elems_per_rank.begin(), elems_per_rank.end(), num_elems / world_size); + int remainder = num_elems % world_size; + for (int i = 0; i < remainder; ++i) { + elems_per_rank[i] += 1; + } +} - std::cout<& elems_per_rank, int num_elems, int world_size){ - // Compute elements to send to each rank; handle remainders for non-even distribution - std::fill(elems_per_rank.begin(), elems_per_rank.end(), num_elems / world_size); - int remainder = num_elems % world_size; - for (int i = 0; i < remainder; ++i) { - elems_per_rank[i] += 1; - } -} +void print_rank_mesh_info(Mesh_t& mesh, int rank) { -void print_mesh_info(Mesh_t& mesh){ + std::cout<(neighbor_gid)); ++offset; // Increment running edge count @@ -1036,15 +1010,84 @@ int main(int argc, char** argv) { * Step 6: Partition (repartition) the mesh using PT-Scotch * - Each vertex (mesh element) will be assigned a part (mesh chunk). * - Arch is initialized for a complete graph of world_size parts (one per rank). - * - Loki **************************************************************/ + // SCOTCH_Arch controls the "architecture" for partitioning: the topology + // (number and connectivity of parts) to which the graph will be mapped. + // The archdat variable encodes this. 
Below are common options: + // + // - SCOTCH_archCmplt(&archdat, nbparts) + // * Creates a "complete graph" architecture with nbparts nodes (fully connected). + // Every part is equally distant from every other part. + // This is typically used when minimizing only *balance* and *edge cut*, + // not considering any underlying machine topology. + // + // - SCOTCH_archHcub(&archdat, dimension) + // * Hypercube architecture (rare in modern use). + // Sets up a hypercube of given dimension. + // + // - SCOTCH_archTleaf / SCOTCH_archTleafX + // * Tree architectures, for hierarchically structured architectures. + // + // - SCOTCH_archMesh2 / SCOTCH_archMesh3 + // * 2D or 3D mesh topology architectures (useful for grid/matrix machines). + // + // - SCOTCH_archBuild + // * General: builds any architecture from a descriptor string. + // + // For distributed mesh partitioning to MPI ranks (where all ranks are equal), + // the most common and appropriate is "complete graph" (Cmplt): each part (rank) + // is equally reachable from any other (no communication topology bias). SCOTCH_Arch archdat; // PT-Scotch architecture structure: describes desired partition topology SCOTCH_archInit(&archdat); - SCOTCH_archCmplt(&archdat, static_cast(world_size)); // Partition into world_size complete nodes + // Partition into 'world_size' equally connected parts (each MPI rank is a "node") + // Other topology options could be substituted above according to your needs (see docs). + SCOTCH_archCmplt(&archdat, static_cast(world_size)); + + + + // ===================== PT-Scotch Strategy Selection and Documentation ====================== + // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. + // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. 
+ // + // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): + // + // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. + // Useful for quick, generic partitions where quality is not critical. + // + // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). + // For large runs or test runs where speed is more important than minimizing edgecut. + // + // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). + // Slower than the default. Use when high-quality partitioning is desired. + // + // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. + // Use if load balance is more critical than cut size. + // + // Additional Options: + // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). + // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). + // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. + // + // Example usage: + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); + // ^ quality-focused, nparts=number of parts/ranks + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); + // ^ speed-focused, allow 5% imbalance + // + // Reference: + // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf + // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. 
+ // + // --------------- Set up the desired partitioning strategy here: --------------- SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings SCOTCH_stratInit(&stratdat); - SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.01); // zero is recursion count, 0=automatic + + // Select partitioning strategy for this run: + // Use SCOTCH_STRATQUALITY for best cut quality. + // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. + // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) + SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.01); // partloctab: output array mapping each local element (vertex) to a *target partition number* // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. @@ -1070,7 +1113,7 @@ int main(int argc, char** argv) { * - Each local element's local index lid and global ID (gid) are listed with the * part to which PT-Scotch has assigned them. 
***************************************************************************/ - print_info = true; + print_info = false; for(int rank_id = 0; rank_id < world_size; rank_id++) { if(rank_id == rank && print_info) { for (size_t lid = 0; lid < mesh.num_elems; ++lid) { @@ -1159,7 +1202,7 @@ int main(int argc, char** argv) { for (int i = 0; i < mesh.num_elems; ++i) if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } - for (int j = 0; j < nodes_per_elem; ++j) { + for (int j = 0; j < nodes_per_elem; j++) { int node_lid = mesh.nodes_in_elem.host(lid, j); int node_gid = mesh.local_to_global_node_mapping.host(node_lid); conn_sendbuf.push_back(node_gid); @@ -1216,7 +1259,7 @@ int main(int argc, char** argv) { for (int i = 0; i < mesh.num_elems; ++i) if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } - for (int j = 0; j < nodes_per_elem; ++j) { + for (int j = 0; j < nodes_per_elem; j++) { int node_lid = mesh.nodes_in_elem.host(lid, j); int node_gid = mesh.local_to_global_node_mapping.host(node_lid); @@ -1308,7 +1351,7 @@ int main(int argc, char** argv) { std::map> node_gid_to_coords; int coord_idx = 0; for (int e = 0; e < num_new_elems; ++e) { - for (int j = 0; j < nodes_per_elem; ++j) { + for (int j = 0; j < nodes_per_elem; j++) { int node_gid = conn_recvbuf[e * nodes_per_elem + j]; if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { node_gid_to_coords[node_gid] = { @@ -1346,28 +1389,20 @@ int main(int argc, char** argv) { double t_ghost_start = MPI_Wtime(); - // Update host arrays for ghost detection - final_mesh.local_to_global_elem_mapping.update_host(); - final_mesh.local_to_global_node_mapping.update_host(); - final_mesh.nodes_in_elem.update_host(); - Kokkos::fence(); - - // Build a set of locally-owned element global IDs for fast lookup - std::set local_elem_gids; - for (int i = 0; i < num_new_elems; ++i) { - local_elem_gids.insert(final_mesh.local_to_global_elem_mapping.host(i)); - } - - // Exchange element 
GIDs with all ranks to know who owns what - // Collect all locally-owned element global IDs to send to other ranks - std::vector local_elem_gids_vec(local_elem_gids.begin(), local_elem_gids.end()); - // First, gather the number of elements each rank owns std::vector elem_counts(world_size); - int local_elem_count = static_cast(local_elem_gids_vec.size()); - - MPI_Allgather(&local_elem_count, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - + + // int MPI_Allgather( + // const void* sendbuf, // Data to send from this process + // int sendcount, // Number of elements to send + // MPI_Datatype sendtype, // Type of send data + // void* recvbuf, // Buffer to receive all data + // int recvcount, // Number of elements to receive from each process + // MPI_Datatype recvtype, // Type of receive data + // MPI_Comm comm // Communicator + // ); + MPI_Allgather(&final_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); // Compute displacements std::vector elem_displs(world_size); int total_elems = 0; @@ -1378,10 +1413,21 @@ int main(int argc, char** argv) { // Gather all element GIDs from all ranks std::vector all_elem_gids(total_elems); - MPI_Allgatherv(local_elem_gids_vec.data(), local_elem_count, MPI_UNSIGNED_LONG_LONG, + + // int MPI_Allgatherv( + // const void* sendbuf, // Data to send from this process + // int sendcount, // Number of elements THIS process sends + // MPI_Datatype sendtype, // Type of send data + // void* recvbuf, // Buffer to receive all data + // const int* recvcounts, // Array: number of elements from each process + // const int* displs, // Array: displacement for each process's data + // MPI_Datatype recvtype, // Type of receive data + // MPI_Comm comm // Communicator + // ); + MPI_Allgatherv(final_mesh.local_to_global_elem_mapping.host_pointer(), final_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, all_elem_gids.data(), elem_counts.data(), elem_displs.data(), MPI_UNSIGNED_LONG_LONG, 
MPI_COMM_WORLD); - + MPI_Barrier(MPI_COMM_WORLD); // Build a map: element GID -> owning rank std::map elem_gid_to_rank; for (int r = 0; r < world_size; ++r) { @@ -1400,7 +1446,7 @@ int main(int argc, char** argv) { // First, collect all nodes that belong to our locally-owned elements std::set local_elem_nodes; for (int lid = 0; lid < num_new_elems; ++lid) { - for (int j = 0; j < nodes_per_elem; ++j) { + for (int j = 0; j < nodes_per_elem; j++) { size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); local_elem_nodes.insert(node_gid); @@ -1414,7 +1460,7 @@ int main(int argc, char** argv) { for (int lid = 0; lid < num_new_elems; ++lid) { size_t elem_gid = final_mesh.local_to_global_elem_mapping.host(lid); - for (int j = 0; j < nodes_per_elem; ++j) { + for (int j = 0; j < nodes_per_elem; j++) { size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); elem_node_conn.push_back(elem_gid); @@ -1427,7 +1473,7 @@ int main(int argc, char** argv) { // First, gather the sizes from each rank std::vector conn_sizes(world_size); MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); - + MPI_Barrier(MPI_COMM_WORLD); // Compute displacements std::vector conn_displs(world_size); int total_conn = 0; @@ -1441,6 +1487,12 @@ int main(int argc, char** argv) { MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, all_conn.data(), conn_sizes.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // create a set for local_elem_gids + std::set local_elem_gids; + for (int i = 0; i < num_new_elems; ++i) { + local_elem_gids.insert(final_mesh.local_to_global_elem_mapping.host(i)); + } // Build a map: node GID -> set of element GIDs that contain it (from other ranks) std::map> node_to_ext_elem; @@ -1506,35 +1558,448 @@ int main(int argc, 
char** argv) { } // Print ghost element info if requested - print_info = true; + print_info = false; for(int i = 0; i < world_size; i++) { + MPI_Barrier(MPI_COMM_WORLD); if(rank == i && print_info) { std::cout << "[rank " << rank << "] owns " << num_new_elems << " elements and has " << final_mesh.num_ghost_elems << " ghost elements" << std::endl; std::cout << "[rank " << rank << "] owned element global IDs: "; - for (int j = 0; j < num_new_elems; ++j) { + for (int j = 0; j < final_mesh.num_elems; j++) { std::cout << final_mesh.local_to_global_elem_mapping(j) << " "; } - std::cout << std::endl; - - - - std::cout << "[rank " << rank << "] ghost element GIDs: "; - for (size_t gid : ghost_elem_gids) { + + // Print global IDs of ghost elements + std::cout << std::endl << "[rank " << rank << "] ghost element global IDs: "; + for (const auto& gid : ghost_elem_gids) { std::cout << gid << " "; } std::cout << std::endl; } + + MPI_Barrier(MPI_COMM_WORLD); + } + + + + // Build the connectivity that includes ghost elements + // Create an extended mesh with owned elements first, then ghost elements appended + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; + + // Step 1: Extract ghost element-node connectivity from all_conn + // Build a map: ghost_elem_gid -> vector of node_gids (ordered as in all_conn) + std::map> ghost_elem_to_nodes; + for (const size_t& ghost_gid : ghost_elem_gids) { + ghost_elem_to_nodes[ghost_gid].reserve(nodes_per_elem); + } + + // Extract nodes for each ghost element from all_conn + // The all_conn array has pairs (elem_gid, node_gid) for each rank's elements + for (int r = 0; r < world_size; ++r) { + if (r == rank) continue; // Skip our own data (we already have owned element connectivity) + int num_pairs = conn_sizes[r] / 2; + + // Process pairs in order - each element's nodes are contiguous + for (int i = 0; i < num_pairs; ++i) { + int offset = conn_displs[r] + i * 2; + 
size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this is one of our ghost elements, record its node (in order) + auto it = ghost_elem_to_nodes.find(elem_gid); + if (it != ghost_elem_to_nodes.end()) { + it->second.push_back(node_gid); + } + } + } + + // Verify each ghost element has the correct number of nodes + for (auto& pair : ghost_elem_to_nodes) { + if (pair.second.size() != static_cast(nodes_per_elem)) { + std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first + << " has " << pair.second.size() << " nodes, expected " << nodes_per_elem << std::endl; + } + } + + // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) + // Start with owned nodes + std::map node_gid_to_extended_lid; + int extended_node_lid = 0; + + // Add all owned nodes + for (int i = 0; i < final_mesh.num_nodes; ++i) { + size_t node_gid = final_mesh.local_to_global_node_mapping.host(i); + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) + std::set ghost_only_nodes; + for (const auto& pair : ghost_elem_to_nodes) { + for (size_t node_gid : pair.second) { + // Check if we already have this node + if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { + ghost_only_nodes.insert(node_gid); + } + } + } + + // Assign extended local IDs to ghost-only nodes + for (size_t node_gid : ghost_only_nodes) { + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + int total_extended_nodes = extended_node_lid; + + // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) + // Build request list: for each ghost node, find an owning rank via any ghost element that contains it + std::map> rank_to_ghost_node_requests; + for (size_t node_gid : ghost_only_nodes) { + // Find which rank owns an element containing this node + // Look through ghost elements + for (const 
auto& pair : ghost_elem_to_nodes) { + size_t ghost_elem_gid = pair.first; + const std::vector& nodes = pair.second; + bool found = false; + for (size_t ngid : nodes) { + if (ngid == node_gid) { + found = true; + break; + } + } + if (found) { + auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); + if (owner_it != elem_gid_to_rank.end()) { + rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); + break; + } + } + } + } + + // Step 4: Build extended element list and node connectivity + // Owned elements: 0 to num_new_elems-1 (already have these) + // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 + + // Create extended element-node connectivity array + int total_extended_elems = final_mesh.num_elems + final_mesh.num_ghost_elems; + std::vector> extended_nodes_in_elem(total_extended_elems); + + // Copy owned element connectivity (convert to extended node LIDs) + for (int lid = 0; lid < final_mesh.num_elems; ++lid) { + extended_nodes_in_elem[lid].reserve(nodes_per_elem); + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[lid].push_back(ext_lid); + } + } + + // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) + int ghost_elem_ext_lid = final_mesh.num_elems; + std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); + std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); + + for (size_t ghost_gid : ghost_elem_gids_ordered) { + auto it = ghost_elem_to_nodes.find(ghost_gid); + if (it == ghost_elem_to_nodes.end()) continue; + + extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); + for (size_t node_gid : it->second) { + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); + } + 
ghost_elem_ext_lid++; + } + + MPI_Barrier(MPI_COMM_WORLD); + // Sequential rank-wise printing of extended mesh structure info + for (int r = 0; r < world_size; ++r) { + if (rank == r) { + std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << final_mesh.num_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; + std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << final_mesh.num_nodes << std::endl; + std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; + std::cout << std::flush; + } MPI_Barrier(MPI_COMM_WORLD); } + + // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements + // Each element's nodes are stored using extended local node IDs (0-based, contiguous) + + // Build reverse maps: extended_lid -> gid for nodes and elements + std::vector extended_lid_to_node_gid(total_extended_nodes); + for (const auto& pair : node_gid_to_extended_lid) { + extended_lid_to_node_gid[pair.second] = pair.first; + } + + // Build extended element GID list: owned first, then ghost + std::vector extended_lid_to_elem_gid(total_extended_elems); + // Owned elements + for (int i = 0; i < final_mesh.num_elems; ++i) { + extended_lid_to_elem_gid[i] = final_mesh.local_to_global_elem_mapping.host(i); + } + // Ghost elements (in sorted order) + for (size_t idx = 0; idx < ghost_elem_gids_ordered.size(); ++idx) { + extended_lid_to_elem_gid[final_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; + } + + mesh_with_ghosts.initialize_nodes(total_extended_nodes); + mesh_with_ghosts.initialize_elems(total_extended_elems, 3); + 
mesh_with_ghosts.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); + mesh_with_ghosts.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); + for (int i = 0; i < total_extended_nodes; i++) { + mesh_with_ghosts.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; + } + for (int i = 0; i < total_extended_elems; i++) { + mesh_with_ghosts.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; + } + mesh_with_ghosts.local_to_global_node_mapping.update_device(); + mesh_with_ghosts.local_to_global_elem_mapping.update_device(); + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< coordinate. + // 3. Use this map to fill node_with_ghosts.coords. + + // 1. Build list of all global node IDs needed on this rank (owned + ghosts) + std::vector all_needed_node_gids(total_extended_nodes); + for (int i = 0; i < total_extended_nodes; ++i) { + all_needed_node_gids[i] = mesh_with_ghosts.local_to_global_node_mapping.host(i); + } + + // 2. Build owned node GIDs and their coordinates + std::vector owned_gids(final_mesh.num_nodes); + for (int i = 0; i < owned_gids.size(); ++i) + owned_gids[i] = final_mesh.local_to_global_node_mapping.host(i); + + // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) + // so we can distribute the needed coordinate data. 
+ // The easiest is to Allgather everyone's "owned_gids" and coords + + int local_owned_count = static_cast(owned_gids.size()); + std::vector owned_counts(world_size, 0); + if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 + + // a) Gather counts + owned_counts.resize(world_size, 0); + MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // b) Displacements and total + std::vector owned_displs(world_size,0); + int total_owned = 0; + for (int r=0; r all_owned_gids(total_owned); + MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, + all_owned_gids.data(), owned_counts.data(), owned_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // d) Global coords (size: total_owned x 3) + std::vector owned_coords_send(3*local_owned_count, 0.0); + for (int i=0; i all_owned_coords(3 * total_owned, 0.0); + + // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + std::vector coord_counts(world_size); + std::vector coord_displs(world_size); + for (int r=0; r coord[3] + std::unordered_map> gid_to_coord; + for (int i=0; i xyz = { + all_owned_coords[3*i+0], + all_owned_coords[3*i+1], + all_owned_coords[3*i+2] + }; + gid_to_coord[all_owned_gids[i]] = xyz; + } + + // 4. Finally, fill node_with_ghosts.coords with correct coordinates. 
+ for (int i = 0; i < total_extended_nodes; ++i) { + size_t gid = mesh_with_ghosts.local_to_global_node_mapping.host(i); + auto it = gid_to_coord.find(gid); + if (it != gid_to_coord.end()) { + node_with_ghosts.coords.host(i,0) = it->second[0]; + node_with_ghosts.coords.host(i,1) = it->second[1]; + node_with_ghosts.coords.host(i,2) = it->second[2]; + } else { + // Could happen if there's a bug: fill with zeros for safety + node_with_ghosts.coords.host(i,0) = 0.0; + node_with_ghosts.coords.host(i,1) = 0.0; + node_with_ghosts.coords.host(i,2) = 0.0; + } + } + node_with_ghosts.coords.update_device(); + + + + + // -------------------------------------------------------------------------------------- + // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost element GIDs. + // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + std::vector>> boundary_elem_targets(final_mesh.num_elems); + + // Prepare local ghost list as vector + std::vector ghost_gids_vec; + ghost_gids_vec.reserve(ghost_elem_gids.size()); + for (const auto &g : ghost_elem_gids) ghost_gids_vec.push_back(g); + + // Exchange counts + std::vector ghost_counts(world_size, 0); + int local_ghost_count = static_cast(ghost_gids_vec.size()); + MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Displacements and recv buffer + std::vector ghost_displs(world_size, 0); + int total_ghosts = 0; + for (int r = 0; r < world_size; ++r) { + ghost_displs[r] = total_ghosts; + total_ghosts += ghost_counts[r]; + } + std::vector all_ghost_gids(total_ghosts); + + // Gather ghost gids + MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, + all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished gathering ghost element GIDs" << std::endl; + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build the reverse map for communication" << std::endl; + // Build map gid -> ranks that ghost it + std::unordered_map> gid_to_ghosting_ranks; + gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); + for (int r = 0; r < world_size; ++r) { + int cnt = ghost_counts[r]; + int off = ghost_displs[r]; + for (int i = 0; i < cnt; ++i) { + size_t g = all_ghost_gids[off + i]; + gid_to_ghosting_ranks[g].push_back(r); + } + } + + // For each local element, list destinations: ranks that ghost our gid + for (int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); + auto it = gid_to_ghosting_ranks.find(local_elem_gid); + if (it == gid_to_ghosting_ranks.end()) 
continue; + const std::vector &dest_ranks = it->second; + for (int rr : dest_ranks) { + if (rr == rank) continue; + boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); + } + } + + std::cout.flush(); + MPI_Barrier(MPI_COMM_WORLD); + // Optional: print a compact summary of reverse map for verification (limited output) + for(int i = 0; i < world_size; i++) { + if (rank == i && print_info) { + std::cout << std::endl; + for (int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + + size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); + if (boundary_elem_targets[elem_lid].empty()) + { + std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost elements" << std::endl; + } + else + { + std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: "; + int shown = 0; + for (const auto &pr : boundary_elem_targets[elem_lid]) { + if (shown >= 12) { std::cout << " ..."; break; } + std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; + shown++; + } + std::cout << std::endl; + } + } + std::cout.flush(); + } + MPI_Barrier(MPI_COMM_WORLD); + } + + print_info = false; MPI_Barrier(MPI_COMM_WORLD); +// NOTES: +// We need to create communication maps for nodes, specifically an index list of +// -- Owned (nodes unique to this rank) +// -- Shared (nodes on the boundary of this rank) +// -- Ghost (nodes on the boundary of this rank that are owned by other ranks) + + +// What we currently have is a communication plan for elements, eg. Each shared element (element on an MPI boundary) knows which rank and associated element global id on that rank it is connected to. 
+ + + + + @@ -1555,16 +2020,24 @@ int main(int argc, char** argv) { MPI_Barrier(MPI_COMM_WORLD); - write_vtk(final_mesh, final_node, rank); + // write_vtk(final_mesh, final_node, rank); + write_vtk(mesh_with_ghosts, node_with_ghosts, rank); + + + MPI_Barrier(MPI_COMM_WORLD); + + // Stop timer and get execution time + double t_main_end = MPI_Wtime(); + + if(rank == 0) { + printf("Total execution time: %.2f seconds\n", t_main_end - t_main_start); + } } // end MATAR scope MATAR_FINALIZE(); MPI_Finalize(); - // Stop timer and get execution time - double time_ms = timer.stop(); - - printf("Execution time: %.2f ms\n", time_ms); + return 0; } \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index b044b599..2a704e14 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -276,7 +276,7 @@ void build_3d_box( Kokkos::fence(); - const int num_cell_scalar_vars = 2; + const int num_cell_scalar_vars = 3; const int num_cell_vec_vars = 0; const int num_cell_tensor_vars = 0; @@ -286,7 +286,7 @@ void build_3d_box( // Scalar values associated with a cell const char cell_scalar_var_names[num_cell_scalar_vars][30] = { - "rank_id", "elems_in_elem_owned" + "rank_id", "elems_in_elem_owned", "global_elem_id" }; // const char cell_vec_var_names[num_cell_vec_vars][15] = { @@ -315,11 +315,11 @@ void build_3d_box( // export material centeric data to the elements - elem_fields(0, 0) = rank; for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { elem_fields(elem_gid, 0) = rank; elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); + elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); } From 717271f99a6601912cc30c9516ffc12a828ddd3f Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 31 Oct 2025 14:38:58 -0500 Subject: [PATCH 12/52] ENH: Adding vtu output, and tidy up --- examples/mesh_decomp/mesh.h | 5 +- examples/mesh_decomp/mesh_decomp.cpp | 29 ++- 
examples/mesh_decomp/mesh_io.h | 309 +++++++++++++++++++++++++-- 3 files changed, 320 insertions(+), 23 deletions(-) diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index 92f3bcdf..a745e17e 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -310,9 +310,12 @@ struct Mesh_t DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping - // Data structure for ghost elements required for MPI comms + size_t num_owned_elems; ///< Number of owned elements on this rank size_t num_ghost_elems; ///< Number of ghost elements on this rank (from neighboring MPI ranks) + size_t num_owned_nodes; ///< Number of owned nodes on this rank + size_t num_ghost_nodes; ///< Number of ghost nodes on this rank (from neighboring MPI ranks) + diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 85f2b7f2..45ffc2f5 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -84,7 +84,7 @@ int main(int argc, char** argv) { // Initial mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {4, 4, 1}; + int num_elems_dim[3] = {25, 25, 25}; Mesh_t initial_mesh; GaussPoint_t initial_GaussPoints; @@ -1720,7 +1720,7 @@ int main(int argc, char** argv) { if (rank == r) { std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; std::cout << "[rank " << rank << "] - Owned elements: " << final_mesh.num_elems << std::endl; - std::cout << "[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; std::cout << "[rank " << rank << "] - Owned nodes: " << final_mesh.num_nodes << 
std::endl; std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; @@ -1763,6 +1763,24 @@ int main(int argc, char** argv) { mesh_with_ghosts.local_to_global_node_mapping.update_device(); mesh_with_ghosts.local_to_global_elem_mapping.update_device(); + mesh_with_ghosts.num_ghost_elems = ghost_elem_gids.size(); + mesh_with_ghosts.num_ghost_nodes = ghost_only_nodes.size(); + + // Set owned counts for write_vtk (excludes ghost elements/nodes) + mesh_with_ghosts.num_owned_elems = final_mesh.num_elems; + mesh_with_ghosts.num_owned_nodes = final_mesh.num_nodes; + + + // Print num ghost elements and nodes on each rank sequentially + for (int r = 0; r < world_size; ++r) { + if (rank == r) { + std::cout << "*******[rank " << rank << "] - Ghost elements: " << mesh_with_ghosts.num_ghost_elems << std::endl; + std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << mesh_with_ghosts.num_ghost_nodes << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"<(num_elems, num_cell_scalar_vars); int elem_switch = 1; @@ -316,7 +317,7 @@ void build_3d_box( // export material centeric data to the elements - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { elem_fields(elem_gid, 0) = rank; elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); @@ -334,6 +335,11 @@ void build_3d_box( vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); point_scalar_fields(node_gid, 0) = rank; + point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); + + if(node_gid == 0) { + std::cout << "*******[rank " << rank << "] - num_corners_in_node: " << mesh.num_corners_in_node(node_gid) << std::endl; + } 
} // end for loop over vertices @@ -362,10 +368,10 @@ void build_3d_box( fprintf(out[0], "ASCII \n"); // part 3 fprintf(out[0], "DATASET UNSTRUCTURED_GRID\n\n"); // part 4 - fprintf(out[0], "POINTS %zu float\n", mesh.num_nodes); + fprintf(out[0], "POINTS %zu float\n", num_nodes); // write all components of the point coordinates - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { fprintf(out[0], "%f %f %f\n", node.coords.host(node_gid, 0), @@ -380,7 +386,7 @@ void build_3d_box( */ fprintf(out[0], "\n"); - fprintf(out[0], "CELLS %lu %lu\n", mesh.num_elems, mesh.num_elems + mesh.num_elems * mesh.num_nodes_in_elem); // size=all printed values + fprintf(out[0], "CELLS %lu %lu\n", num_elems, num_elems + num_elems * mesh.num_nodes_in_elem); // size=all printed values int Pn_order = mesh.Pn; int order[3] = { Pn_order, Pn_order, Pn_order }; @@ -388,7 +394,7 @@ void build_3d_box( // const int num_1D_points = Pn_order+1; // write all global point numbers for this elem - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { fprintf(out[0], "%lu ", mesh.num_nodes_in_elem); // num points in this elem for (int k = 0; k <= Pn_order; k++) { @@ -405,14 +411,14 @@ void build_3d_box( // Write the element types fprintf(out[0], "\n"); - fprintf(out[0], "CELL_TYPES %zu \n", mesh.num_elems); + fprintf(out[0], "CELL_TYPES %zu \n", num_elems); // VTK_LAGRANGE_HEXAHEDRON: 72, // VTK_HIGHER_ORDER_HEXAHEDRON: 67 // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + for (size_t elem_gid = 0; 
elem_gid < num_elems; elem_gid++) { fprintf(out[0], "%d \n", 72); } @@ -423,12 +429,12 @@ void build_3d_box( */ fprintf(out[0], "\n"); - fprintf(out[0], "POINT_DATA %zu \n", mesh.num_nodes); + fprintf(out[0], "POINT_DATA %zu \n", num_nodes); // vtk vector vars = (position, velocity) for (int var = 0; var < num_point_vec_vars; var++) { fprintf(out[0], "VECTORS %s float \n", point_vec_var_names[var]); - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { fprintf(out[0], "%f %f %f\n", vec_fields(node_gid, var, 0), vec_fields(node_gid, var, 1), @@ -437,15 +443,15 @@ void build_3d_box( } // end for vec_vars - // vtk scalar vars = (temp) + // vtk scalar vars = (rank_id, elems_in_node) for (int var = 0; var < num_point_scalar_vars; var++) { fprintf(out[0], "SCALARS %s float 1\n", point_scalar_var_names[var]); fprintf(out[0], "LOOKUP_TABLE default\n"); - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { fprintf(out[0], "%f\n", - point_scalar_fields(node_gid, 0)); + point_scalar_fields(node_gid, var)); } // end for nodes - } // end for vec_vars + } // end for scalar_vars /* --------------------------------------------------------------------------- @@ -453,12 +459,12 @@ void build_3d_box( --------------------------------------------------------------------------- */ fprintf(out[0], "\n"); - fprintf(out[0], "CELL_DATA %zu \n", mesh.num_elems); + fprintf(out[0], "CELL_DATA %zu \n", num_elems); for (int var = 0; var < num_cell_scalar_vars; var++) { fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] fprintf(out[0], "LOOKUP_TABLE default\n"); - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { fprintf(out[0], "%f\n", elem_fields(elem_gid, var)); } // end for elem } // end for 
cell scalar_vars @@ -496,5 +502,270 @@ void build_3d_box( } // end write vtk old +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn write_vtu +/// +/// \brief Writes a VTU (XML VTK) output file per MPI rank and a PVTU file +/// for parallel visualization in ParaView +/// +/// \param mesh mesh +/// \param node node data +/// \param rank MPI rank +/// \param comm MPI communicator +/// +///////////////////////////////////////////////////////////////////////////// +void write_vtu(Mesh_t& mesh, + node_t& node, + int rank, + MPI_Comm comm) +{ + int world_size; + MPI_Comm_size(comm, &world_size); + + CArray graphics_times(1); + int graphics_id = 0; + graphics_times(0) = 0.0; + + // ---- Update host data ---- + node.coords.update_host(); + Kokkos::fence(); + + const int num_cell_scalar_vars = 3; + const int num_cell_vec_vars = 0; + const int num_cell_tensor_vars = 0; + + const int num_point_scalar_vars = 2; + const int num_point_vec_vars = 1; + + // Scalar values associated with a cell + const char cell_scalar_var_names[num_cell_scalar_vars][30] = { + "rank_id", "elems_in_elem_owned", "global_elem_id" + }; + + const char point_scalar_var_names[num_point_scalar_vars][15] = { + "rank_id", "elems_in_node" + }; + + const char point_vec_var_names[num_point_vec_vars][15] = { + "pos" + }; + + // short hand + const size_t num_nodes = mesh.num_owned_nodes; + const size_t num_elems = mesh.num_owned_elems; + const size_t num_dims = mesh.num_dims; + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_cell_scalar_vars); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + elem_fields(elem_gid, 0) = rank; + elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); + elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); + } + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, 
num_point_vec_vars, 3); + CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = node.coords.host(node_gid, 1); + vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + + point_scalar_fields(node_gid, 0) = rank; + point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); + } + + // File management + char filename[200]; + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + if (stat("vtk", &st) != 0) { + system("mkdir vtk"); + } + + // Create VTU filename for this rank + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d_rank%d.vtu", graphics_id, rank); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + FILE* vtu_file = fopen(filename, "w"); + if (!vtu_file) { + std::cerr << "[rank " << rank << "] Failed to open VTU file: " << filename << std::endl; + return; + } + + // Write VTU XML header + fprintf(vtu_file, "\n"); + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n", num_nodes, num_elems); + + // Write Points (coordinates) + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f %f %f\n", + node.coords.host(node_gid, 0), + node.coords.host(node_gid, 1), + node.coords.host(node_gid, 2)); + } + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + + // Write Cells (connectivity) + fprintf(vtu_file, " \n"); + + // Connectivity array - all node indices for all cells, space-separated + fprintf(vtu_file, " \n"); + int Pn_order = mesh.Pn; + int order[3] = { Pn_order, Pn_order, Pn_order }; + + // Write connectivity: all node IDs for all elements, space-separated + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + for (int k = 0; k <= Pn_order; k++) 
{ + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + size_t node_idx = mesh.nodes_in_elem.host(elem_gid, node_lid); + // Cast to int for Int32 format (valid for node indices < 2^31) + fprintf(vtu_file, " %d", static_cast(node_idx)); + } + } + } + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + + // Offsets array - cumulative index where each cell's connectivity ends + fprintf(vtu_file, " \n"); + int offset = 0; + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + offset += static_cast(mesh.num_nodes_in_elem); + fprintf(vtu_file, " %d", offset); + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + + // Types array (72 = VTK_LAGRANGE_HEXAHEDRON) + fprintf(vtu_file, " \n"); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(vtu_file, " 72"); + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + + // Write PointData (node fields) + fprintf(vtu_file, " \n"); + + // Point vector variables + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(vtu_file, " \n", + point_vec_var_names[var]); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f %f %f\n", + vec_fields(node_gid, var, 0), + vec_fields(node_gid, var, 1), + vec_fields(node_gid, var, 2)); + } + fprintf(vtu_file, " \n"); + } + + // Point scalar variables + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(vtu_file, " \n", + point_scalar_var_names[var]); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f\n", point_scalar_fields(node_gid, var)); + } + fprintf(vtu_file, " \n"); + } + fprintf(vtu_file, " \n"); + + // Write CellData (element fields) + fprintf(vtu_file, " \n"); + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(vtu_file, " \n", + cell_scalar_var_names[var]); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) 
{ + fprintf(vtu_file, " %f\n", elem_fields(elem_gid, var)); + } + fprintf(vtu_file, " \n"); + } + fprintf(vtu_file, " \n"); + + // Close VTU file + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, "\n"); + fclose(vtu_file); + + // Write PVTU file (only rank 0, after all ranks have written their VTU files) + MPI_Barrier(comm); + + if (rank == 0) { + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d.pvtu", graphics_id); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + FILE* pvtu_file = fopen(filename, "w"); + if (!pvtu_file) { + std::cerr << "[rank 0] Failed to open PVTU file: " << filename << std::endl; + return; + } + + // Write PVTU XML header + fprintf(pvtu_file, "\n"); + fprintf(pvtu_file, "\n"); + fprintf(pvtu_file, " \n"); + + // Write PPoints + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + + // Write PCells + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + + // Write PPointData + fprintf(pvtu_file, " \n"); + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(pvtu_file, " \n", + point_vec_var_names[var]); + } + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(pvtu_file, " \n", + point_scalar_var_names[var]); + } + fprintf(pvtu_file, " \n"); + + // Write PCellData + fprintf(pvtu_file, " \n"); + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(pvtu_file, " \n", + cell_scalar_var_names[var]); + } + fprintf(pvtu_file, " \n"); + + // Write Piece references for each rank + for (int r = 0; r < world_size; r++) { + fprintf(pvtu_file, " \n", graphics_id, r); + } + + // Close PVTU file + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, "\n"); + fclose(pvtu_file); + } + +} // end write_vtu #endif \ No newline at end of file From 761faef6bd82d2262cb797e53d93322494239c9e Mon Sep 17 00:00:00 2001 From: 
Jacob Moore Date: Fri, 31 Oct 2025 14:51:04 -0500 Subject: [PATCH 13/52] ENH: Cleaning up, WIP --- examples/mesh_decomp/decomp_utils.h | 0 examples/mesh_decomp/mesh_decomp.cpp | 241 ++++++++++++--------------- examples/mesh_decomp/mesh_io.h | 4 +- 3 files changed, 103 insertions(+), 142 deletions(-) create mode 100644 examples/mesh_decomp/decomp_utils.h diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h new file mode 100644 index 00000000..e69de29b diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 45ffc2f5..4cc6da9e 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -26,20 +26,6 @@ void calc_elements_per_rank(std::vector& elems_per_rank, int num_elems, int } } -void print_mesh_info(Mesh_t& mesh){ - std::cout<<"Mesh has "<(num_new_nodes); - final_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + // -------------- Phase 6: Build the intermediate_mesh -------------- + intermediate_mesh.initialize_nodes(num_new_nodes); + intermediate_mesh.initialize_elems(num_new_elems, mesh.num_dims); + intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); // Fill global mappings for (int i = 0; i < num_new_nodes; ++i) - final_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; + intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; for (int i = 0; i < num_new_elems; ++i) - final_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; + intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; - final_mesh.local_to_global_node_mapping.update_device(); - final_mesh.local_to_global_elem_mapping.update_device(); + intermediate_mesh.local_to_global_node_mapping.update_device(); + intermediate_mesh.local_to_global_elem_mapping.update_device(); MPI_Barrier(MPI_COMM_WORLD); @@ -1325,7 +1309,7 
@@ int main(int argc, char** argv) { int left = 0, right = num_new_nodes - 1; while (left <= right) { int mid = left + (right - left) / 2; - size_t mid_gid = final_mesh.local_to_global_node_mapping.host(mid); + size_t mid_gid = intermediate_mesh.local_to_global_node_mapping.host(mid); if (node_gid == mid_gid) { node_lid = mid; break; @@ -1336,14 +1320,14 @@ int main(int argc, char** argv) { } } - final_mesh.nodes_in_elem.host(i, j) = node_lid; + intermediate_mesh.nodes_in_elem.host(i, j) = node_lid; } } MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout<<" Finished reverse mapping of the element-node connectivity from the global node ids to the local node ids"<second[0]; - final_node.coords.host(i, 1) = it->second[1]; - final_node.coords.host(i, 2) = it->second[2]; + intermediate_node.coords.host(i, 0) = it->second[0]; + intermediate_node.coords.host(i, 1) = it->second[1]; + intermediate_node.coords.host(i, 2) = it->second[2]; } } - final_node.coords.update_device(); + intermediate_node.coords.update_device(); // Connectivity rebuild - final_mesh.build_connectivity(); + intermediate_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); @@ -1401,7 +1385,7 @@ int main(int argc, char** argv) { // MPI_Datatype recvtype, // Type of receive data // MPI_Comm comm // Communicator // ); - MPI_Allgather(&final_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); // Compute displacements std::vector elem_displs(world_size); @@ -1424,7 +1408,7 @@ int main(int argc, char** argv) { // MPI_Datatype recvtype, // Type of receive data // MPI_Comm comm // Communicator // ); - MPI_Allgatherv(final_mesh.local_to_global_elem_mapping.host_pointer(), final_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, + MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, 
all_elem_gids.data(), elem_counts.data(), elem_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); @@ -1447,8 +1431,8 @@ int main(int argc, char** argv) { std::set local_elem_nodes; for (int lid = 0; lid < num_new_elems; ++lid) { for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); local_elem_nodes.insert(node_gid); } } @@ -1459,10 +1443,10 @@ int main(int argc, char** argv) { int local_conn_size = 0; for (int lid = 0; lid < num_new_elems; ++lid) { - size_t elem_gid = final_mesh.local_to_global_elem_mapping.host(lid); + size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); elem_node_conn.push_back(elem_gid); elem_node_conn.push_back(node_gid); } @@ -1491,7 +1475,7 @@ int main(int argc, char** argv) { // create a set for local_elem_gids std::set local_elem_gids; for (int i = 0; i < num_new_elems; ++i) { - local_elem_gids.insert(final_mesh.local_to_global_elem_mapping.host(i)); + local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); } // Build a map: node GID -> set of element GIDs that contain it (from other ranks) @@ -1528,13 +1512,13 @@ int main(int argc, char** argv) { // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) for (int lid = 0; lid < num_new_elems; ++lid) { - size_t num_neighbors = final_mesh.num_elems_in_elem(lid); + size_t 
num_neighbors = intermediate_mesh.num_elems_in_elem(lid); for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { - size_t neighbor_lid = final_mesh.elems_in_elem(lid, nbr_idx); + size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); if (neighbor_lid < static_cast(num_new_elems)) { - size_t neighbor_gid = final_mesh.local_to_global_elem_mapping(neighbor_lid); + size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); // Check if neighbor is owned by this rank auto it = elem_gid_to_rank.find(neighbor_gid); @@ -1547,7 +1531,7 @@ int main(int argc, char** argv) { } // Count unique ghost elements - final_mesh.num_ghost_elems = ghost_elem_gids.size(); + intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); MPI_Barrier(MPI_COMM_WORLD); double t_ghost_end = MPI_Wtime(); @@ -1563,10 +1547,10 @@ int main(int argc, char** argv) { MPI_Barrier(MPI_COMM_WORLD); if(rank == i && print_info) { std::cout << "[rank " << rank << "] owns " << num_new_elems - << " elements and has " << final_mesh.num_ghost_elems << " ghost elements" << std::endl; + << " elements and has " << intermediate_mesh.num_ghost_elems << " ghost elements" << std::endl; std::cout << "[rank " << rank << "] owned element global IDs: "; - for (int j = 0; j < final_mesh.num_elems; j++) { - std::cout << final_mesh.local_to_global_elem_mapping(j) << " "; + for (int j = 0; j < intermediate_mesh.num_elems; j++) { + std::cout << intermediate_mesh.local_to_global_elem_mapping(j) << " "; } // Print global IDs of ghost elements @@ -1629,8 +1613,8 @@ int main(int argc, char** argv) { int extended_node_lid = 0; // Add all owned nodes - for (int i = 0; i < final_mesh.num_nodes; ++i) { - size_t node_gid = final_mesh.local_to_global_node_mapping.host(i); + for (int i = 0; i < intermediate_mesh.num_nodes; ++i) { + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(i); node_gid_to_extended_lid[node_gid] = extended_node_lid++; } @@ -1683,22 +1667,22 @@ int 
main(int argc, char** argv) { // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 // Create extended element-node connectivity array - int total_extended_elems = final_mesh.num_elems + final_mesh.num_ghost_elems; + int total_extended_elems = intermediate_mesh.num_elems + intermediate_mesh.num_ghost_elems; std::vector> extended_nodes_in_elem(total_extended_elems); // Copy owned element connectivity (convert to extended node LIDs) - for (int lid = 0; lid < final_mesh.num_elems; ++lid) { + for (int lid = 0; lid < intermediate_mesh.num_elems; ++lid) { extended_nodes_in_elem[lid].reserve(nodes_per_elem); for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); int ext_lid = node_gid_to_extended_lid[node_gid]; extended_nodes_in_elem[lid].push_back(ext_lid); } } // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) - int ghost_elem_ext_lid = final_mesh.num_elems; + int ghost_elem_ext_lid = intermediate_mesh.num_elems; std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); @@ -1719,10 +1703,10 @@ int main(int argc, char** argv) { for (int r = 0; r < world_size; ++r) { if (rank == r) { std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; - std::cout << "[rank " << rank << "] - Owned elements: " << final_mesh.num_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << intermediate_mesh.num_elems << std::endl; std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << 
std::endl; - std::cout << "[rank " << rank << "] - Owned nodes: " << final_mesh.num_nodes << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << intermediate_mesh.num_nodes << std::endl; std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; std::cout << std::flush; @@ -1742,40 +1726,40 @@ int main(int argc, char** argv) { // Build extended element GID list: owned first, then ghost std::vector extended_lid_to_elem_gid(total_extended_elems); // Owned elements - for (int i = 0; i < final_mesh.num_elems; ++i) { - extended_lid_to_elem_gid[i] = final_mesh.local_to_global_elem_mapping.host(i); + for (int i = 0; i < intermediate_mesh.num_elems; ++i) { + extended_lid_to_elem_gid[i] = intermediate_mesh.local_to_global_elem_mapping.host(i); } // Ghost elements (in sorted order) for (size_t idx = 0; idx < ghost_elem_gids_ordered.size(); ++idx) { - extended_lid_to_elem_gid[final_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; + extended_lid_to_elem_gid[intermediate_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; } - mesh_with_ghosts.initialize_nodes(total_extended_nodes); - mesh_with_ghosts.initialize_elems(total_extended_elems, 3); - mesh_with_ghosts.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); - mesh_with_ghosts.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); + final_mesh.initialize_nodes(total_extended_nodes); + final_mesh.initialize_elems(total_extended_elems, 3); + final_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); + final_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); for (int i = 0; i < total_extended_nodes; i++) { - mesh_with_ghosts.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; + final_mesh.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; } for (int i = 0; i < 
total_extended_elems; i++) { - mesh_with_ghosts.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; + final_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; } - mesh_with_ghosts.local_to_global_node_mapping.update_device(); - mesh_with_ghosts.local_to_global_elem_mapping.update_device(); + final_mesh.local_to_global_node_mapping.update_device(); + final_mesh.local_to_global_elem_mapping.update_device(); - mesh_with_ghosts.num_ghost_elems = ghost_elem_gids.size(); - mesh_with_ghosts.num_ghost_nodes = ghost_only_nodes.size(); + final_mesh.num_ghost_elems = ghost_elem_gids.size(); + final_mesh.num_ghost_nodes = ghost_only_nodes.size(); // Set owned counts for write_vtk (excludes ghost elements/nodes) - mesh_with_ghosts.num_owned_elems = final_mesh.num_elems; - mesh_with_ghosts.num_owned_nodes = final_mesh.num_nodes; + final_mesh.num_owned_elems = intermediate_mesh.num_elems; + final_mesh.num_owned_nodes = intermediate_mesh.num_nodes; // Print num ghost elements and nodes on each rank sequentially for (int r = 0; r < world_size; ++r) { if (rank == r) { - std::cout << "*******[rank " << rank << "] - Ghost elements: " << mesh_with_ghosts.num_ghost_elems << std::endl; - std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << mesh_with_ghosts.num_ghost_nodes << std::endl; + std::cout << "*******[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; + std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_ghost_nodes << std::endl; } MPI_Barrier(MPI_COMM_WORLD); } @@ -1790,16 +1774,16 @@ int main(int argc, char** argv) { // extended_nodes_in_elem already contains extended local node IDs, so we can use them directly for(int i = 0; i < total_extended_elems; i++) { for(int j = 0; j < nodes_per_elem; j++) { - mesh_with_ghosts.nodes_in_elem.host(i, j) = extended_nodes_in_elem[i][j]; + final_mesh.nodes_in_elem.host(i, j) = extended_nodes_in_elem[i][j]; } } 
MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout<<" Finished reverse mapping of the element-node connectivity from the global node ids to the local node ids"< coordinate. - // 3. Use this map to fill node_with_ghosts.coords. + // 3. Use this map to fill final_node.coords. // 1. Build list of all global node IDs needed on this rank (owned + ghosts) std::vector all_needed_node_gids(total_extended_nodes); for (int i = 0; i < total_extended_nodes; ++i) { - all_needed_node_gids[i] = mesh_with_ghosts.local_to_global_node_mapping.host(i); + all_needed_node_gids[i] = final_mesh.local_to_global_node_mapping.host(i); } // 2. Build owned node GIDs and their coordinates - std::vector owned_gids(final_mesh.num_nodes); + std::vector owned_gids(intermediate_mesh.num_nodes); for (int i = 0; i < owned_gids.size(); ++i) - owned_gids[i] = final_mesh.local_to_global_node_mapping.host(i); + owned_gids[i] = intermediate_mesh.local_to_global_node_mapping.host(i); // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) // so we can distribute the needed coordinate data. @@ -1861,9 +1845,9 @@ int main(int argc, char** argv) { // d) Global coords (size: total_owned x 3) std::vector owned_coords_send(3*local_owned_count, 0.0); for (int i=0; i all_owned_coords(3 * total_owned, 0.0); @@ -1890,22 +1874,22 @@ int main(int argc, char** argv) { gid_to_coord[all_owned_gids[i]] = xyz; } - // 4. Finally, fill node_with_ghosts.coords with correct coordinates. + // 4. Finally, fill final_node.coords with correct coordinates. 
for (int i = 0; i < total_extended_nodes; ++i) { - size_t gid = mesh_with_ghosts.local_to_global_node_mapping.host(i); + size_t gid = final_mesh.local_to_global_node_mapping.host(i); auto it = gid_to_coord.find(gid); if (it != gid_to_coord.end()) { - node_with_ghosts.coords.host(i,0) = it->second[0]; - node_with_ghosts.coords.host(i,1) = it->second[1]; - node_with_ghosts.coords.host(i,2) = it->second[2]; + final_node.coords.host(i,0) = it->second[0]; + final_node.coords.host(i,1) = it->second[1]; + final_node.coords.host(i,2) = it->second[2]; } else { // Could happen if there's a bug: fill with zeros for safety - node_with_ghosts.coords.host(i,0) = 0.0; - node_with_ghosts.coords.host(i,1) = 0.0; - node_with_ghosts.coords.host(i,2) = 0.0; + final_node.coords.host(i,0) = 0.0; + final_node.coords.host(i,1) = 0.0; + final_node.coords.host(i,2) = 0.0; } } - node_with_ghosts.coords.update_device(); + final_node.coords.update_device(); @@ -1917,7 +1901,7 @@ int main(int argc, char** argv) { // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
// -------------------------------------------------------------------------------------- - std::vector>> boundary_elem_targets(final_mesh.num_elems); + std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); // Prepare local ghost list as vector std::vector ghost_gids_vec; @@ -1962,8 +1946,8 @@ int main(int argc, char** argv) { } // For each local element, list destinations: ranks that ghost our gid - for (int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { - size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); auto it = gid_to_ghosting_ranks.find(local_elem_gid); if (it == gid_to_ghosting_ranks.end()) continue; const std::vector &dest_ranks = it->second; @@ -1979,9 +1963,9 @@ int main(int argc, char** argv) { for(int i = 0; i < world_size; i++) { if (rank == i && print_info) { std::cout << std::endl; - for (int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { - size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); + size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); if (boundary_elem_targets[elem_lid].empty()) { std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost elements" << std::endl; @@ -2010,41 +1994,20 @@ int main(int argc, char** argv) { -// NOTES: -// We need to create communication maps for nodes, specifically an index list of -// -- Owned (nodes unique to this rank) -// -- Shared (nodes on the boundary of this rank) -// -- Ghost (nodes on the boundary of this rank that are owned by other ranks) - - -// What we currently have is a communication plan for elements, eg. 
Each shared element (element on an MPI boundary) knows which rank and associated element global id on that rank it is connected to. - - - - - - - - - - - - - for(int i = 0; i < world_size; i++) { if(rank == i && print_info) { - print_rank_mesh_info(final_mesh, i); + print_rank_mesh_info(intermediate_mesh, i); } MPI_Barrier(MPI_COMM_WORLD); } MPI_Barrier(MPI_COMM_WORLD); - // write_vtk(final_mesh, final_node, rank); - write_vtu(mesh_with_ghosts, node_with_ghosts, rank, MPI_COMM_WORLD); + // write_vtk(intermediate_mesh, intermediate_node, rank); + write_vtu(final_mesh, final_node, rank, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index c1be0881..7e6f6c83 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -631,9 +631,7 @@ void write_vtu(Mesh_t& mesh, for (int j = 0; j <= Pn_order; j++) { for (int i = 0; i <= Pn_order; i++) { size_t node_lid = PointIndexFromIJK(i, j, k, order); - size_t node_idx = mesh.nodes_in_elem.host(elem_gid, node_lid); - // Cast to int for Int32 format (valid for node indices < 2^31) - fprintf(vtu_file, " %d", static_cast(node_idx)); + fprintf(vtu_file, " %zu", static_cast(mesh.nodes_in_elem.host(elem_gid, node_lid))); } } } From bdd0c1928301c8c43f9b292b1a85e790d24c13ca Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 31 Oct 2025 15:53:20 -0500 Subject: [PATCH 14/52] ENH: Tidying up main --- examples/mesh_decomp/decomp_utils.h | 1956 +++++++++++++++++++++++++ examples/mesh_decomp/mesh_decomp.cpp | 1987 +------------------------- examples/mesh_decomp/mesh_io.h | 20 - 3 files changed, 1976 insertions(+), 1987 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index e69de29b..0357b6a6 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -0,0 +1,1956 @@ +#ifndef DECOMP_UTILS_H +#define DECOMP_UTILS_H + +#include +#include +#include +#include 
+#include +#include +#include +#include + + +#include "mesh.h" +#include "state.h" +#include "mesh_io.h" + +// Include Scotch headers +#include "scotch.h" +#include "ptscotch.h" + +void partition_mesh( + Mesh_t& initial_mesh, + Mesh_t& final_mesh, + node_t& initial_node, + node_t& final_node, + int world_size, + int rank){ + + bool print_info = false; + bool print_vtk = false; + + // Create mesh, gauss points, and node data structures on each rank + // This is the initial partitioned mesh + Mesh_t naive_mesh; + node_t naive_node; + + // Mesh partitioned by pt-scotch, not including ghost + Mesh_t intermediate_mesh; + node_t intermediate_node; + + int num_elements_on_rank = 0; + int num_nodes_on_rank = 0; + + int num_nodes_per_elem = 0; + + std::vector elements_on_rank; + std::vector nodes_on_rank; + + + std::vector elems_per_rank(world_size); // number of elements to send to each rank size(world_size) + std::vector nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) + + // create a 2D vector of elements to send to each rank + std::vector> elements_to_send(world_size); + + // create a 2D vector of nodes to send to each rank + std::vector> nodes_to_send(world_size); + + // Create a 2D vector to hold the nodal positions on each rank + std::vector> node_pos_to_send(world_size); + + // create a 2D vector to hold the node positions on each rank + std::vector> node_pos_on_rank(world_size); + + + + + if (rank == 0) { + + num_nodes_per_elem = initial_mesh.num_nodes_in_elem; + + // Compute elements to send to each rank; handle remainders for non-even distribution + + // Compute elements to send to each rank; handle remainders for non-even distribution + std::fill(elems_per_rank.begin(), elems_per_rank.end(), initial_mesh.num_elems / world_size); + int remainder = initial_mesh.num_elems % world_size; + for (int i = 0; i < remainder; ++i) { + elems_per_rank[i] += 1; + } + } + + MPI_Bcast(&num_nodes_per_elem, 1, MPI_INT, 0, MPI_COMM_WORLD); + 
MPI_Barrier(MPI_COMM_WORLD); + +// ******************************************************** +// Scatter the number of elements to each rank +// ******************************************************** + // All ranks participate in the scatter operation + // MPI_Scatter signature: + // MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + // void *recvbuf, int recvcount, MPI_Datatype recvtype, + // int root, MPI_Comm comm) + double t_scatter_start = MPI_Wtime(); + MPI_Scatter(elems_per_rank.data(), 1, MPI_INT, + &num_elements_on_rank, 1, MPI_INT, + 0, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // Resize the elements_on_rank vector to hold the received data + elements_on_rank.resize(num_elements_on_rank); + + + MPI_Barrier(MPI_COMM_WORLD); + double t_scatter_end = MPI_Wtime(); + if(rank == 0) { + std::cout<<" Finished scattering the number of elements to each rank"< all_elements; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = elems_per_rank[i]; + displs[i] = displacement; + // Copy elements for rank i to the flattened array + for (int j = 0; j < elems_per_rank[i]; j++) { + all_elements.push_back(elements_to_send[i][j]); + } + displacement += elems_per_rank[i]; + } + + // Send the elements to each rank + MPI_Scatterv(all_elements.data(), sendcounts.data(), displs.data(), MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + double t_scatter_gids_end = MPI_Wtime(); + if(rank == 0) { + std::cout<<" Finished scattering the actual element global ids to each rank"< nodes_set; + for (int j = 0; j < elems_per_rank[i]; j++) { + for (int k = 0; k < num_nodes_per_elem; k++) { + 
nodes_set.insert(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + } + } + nodes_to_send[i] = std::vector(nodes_set.begin(), nodes_set.end()); + } + + for (int i = 0; i < world_size; i++) { + nodes_per_rank[i] = nodes_to_send[i].size(); + } + + if (print_info) { + + + std::cout< all_nodes; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size(); + displs[i] = displacement; + // Copy nodes for rank i to the flattened array + for (int j = 0; j < nodes_to_send[i].size(); j++) { + all_nodes.push_back(nodes_to_send[i][j]); + } + displacement += nodes_to_send[i].size(); + } + // Send the nodes to each rank + // all_nodes.data(): Pointer to the flattened array of all nodes to be sent to each rank + // sendcounts.data(): Array with the number of nodes to send to each rank + // displs.data(): Array with the displacement for each rank in the flattened array + // MPI_INT: Data type of the nodes (integer) + // nodes_on_rank.data(): Pointer to the buffer where each rank will receive its nodes + // num_nodes_on_rank: Number of nodes that the receiving rank expects to receive + // MPI_INT: Data type of the receive buffer (integer) + // 0: The root rank (rank 0) that is performing the scatter + // MPI_COMM_WORLD: The communicator + MPI_Scatterv(all_nodes.data(), sendcounts.data(), displs.data(), MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + // Timer: End measuring time for scattering node global ids + double t_scatter_nodeids_end = MPI_Wtime(); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) { + std::cout<<" Finished scattering the actual node global ids to each rank"< node_pos_on_rank_flat(num_nodes_on_rank * 3); + + // Timer for 
scattering node positions + double t_scatter_nodepos_start = MPI_Wtime(); + + if(rank == 0) + { + for (int i = 0; i < world_size; i++) { + for(int node_gid = 0; node_gid < nodes_to_send[i].size(); node_gid++) + { + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 0)); + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 1)); + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 2)); + } + } + + // Prepare data for MPI_Scatterv (scatter with variable counts) + // Flatten the 2D node_pos_to_send into a 1D array + std::vector all_node_pos; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size() * 3; + displs[i] = displacement; // displacement is the starting index of the nodes for the current rank in the flattened array + // Copy node positions for rank i to the flattened array + for(int j = 0; j < nodes_to_send[i].size(); j++) { + for(int k = 0; k < 3; k++) { + all_node_pos.push_back(node_pos_to_send[i][j * 3 + k]); + } + } + displacement += nodes_to_send[i].size() * 3; + } + + // Send the node positions to each rank + MPI_Scatterv(all_node_pos.data(), sendcounts.data(), displs.data(), MPI_DOUBLE, + node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_DOUBLE, + node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 0 && print_info) { + // Print out the node positions on this rank + std::cout << "Rank " << rank << " received node positions: "; + for (int i = 0; i < num_nodes_on_rank; i++) { + std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " + << node_pos_on_rank_flat[i*3+1] << ", " + << node_pos_on_rank_flat[i*3+2] << ") "; + } + std::cout << std::endl; + } + + + 
MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 1 && print_info) { + // Print out the node positions on this rank + std::cout << "Rank " << rank << " received node positions: "; + for (int i = 0; i < num_nodes_on_rank; i++) { + std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " + << node_pos_on_rank_flat[i*3+1] << ", " + << node_pos_on_rank_flat[i*3+2] << ") "; + } + std::cout << std::endl; + } + + MPI_Barrier(MPI_COMM_WORLD); + + double t_scatter_nodepos_end = MPI_Wtime(); + if(rank == 0) { + std::cout<<" Finished scattering the node positions to each rank"< required_node_state = { node_state::coords }; + naive_node.initialize(num_nodes_on_rank, 3, required_node_state); + + for(int i = 0; i < num_nodes_on_rank; i++) { + naive_node.coords.host(i, 0) = node_pos_on_rank_flat[i*3]; + naive_node.coords.host(i, 1) = node_pos_on_rank_flat[i*3+1]; + naive_node.coords.host(i, 2) = node_pos_on_rank_flat[i*3+2]; + } + + naive_node.coords.update_device(); + + +// ****************************************************************************************** +// Send the element-node connectivity data from the initial mesh to each rank +// ****************************************************************************************** + + // Send the element-node connectivity data from the initial mesh to each rank + std::vector nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); + + double t_scatter_elemnode_start = MPI_Wtime(); + + if (rank == 0) { + // Prepare element-node connectivity data for each rank + std::vector all_nodes_in_elem; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for(int i = 0; i < world_size; i++) { + int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element + sendcounts[i] = num_connectivity_entries; + displs[i] = displacement; + + // Copy element-node connectivity for rank i + for(int j = 0; j < elements_to_send[i].size(); j++) { + 
for(int k = 0; k < num_nodes_per_elem; k++) { + all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + } + } + displacement += num_connectivity_entries; + } + // Send the connectivity data to each rank + MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + double t_scatter_elemnode_end = MPI_Wtime(); + if(rank == 0) { + std::cout << " Finished scattering the element-node connectivity data from the initial mesh to each rank" << std::endl; + std::cout << " Scattering element-node connectivity took " + << (t_scatter_elemnode_end - t_scatter_elemnode_start) << " seconds." << std::endl; + } + + if (rank == 0 && print_info) { + + std::cout << "Rank " << rank << " received element-node connectivity (" + << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; + for (int elem = 0; elem < num_elements_on_rank; elem++) { + std::cout << " Element " << elem << " nodes: "; + for (int node = 0; node < num_nodes_per_elem; node++) { + int idx = elem * num_nodes_per_elem + node; + std::cout << nodes_in_elem_on_rank[idx] << " "; + } + std::cout << std::endl; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished scattering the element-node connectivity data from the initial mesh to each rank"< elem_elem_counts(world_size); + int total_elem_elem_entries = 0; + + + double t_scatter_elem_elem_start = MPI_Wtime(); + + if (rank == 0){ + // Calculate total number of connectivity entries for each rank + for(int i = 0; i < world_size; i++) { + elem_elem_counts[i] = 0; + for(int k = 0; k < elements_to_send[i].size(); k++) { + elem_elem_counts[i] += 
initial_mesh.num_elems_in_elem(elements_to_send[i][k]); + } + + if(print_info) std::cout << "Rank " << i << " will receive " << elem_elem_counts[i] << " element-element connectivity entries" << std::endl; + } + + // Print element-element connectivity entries for each rank in the initial mesh + if(print_info) { + for(int i = 0; i < world_size; i++) { + std::cout << std::endl; + std::cout << "Rank " << i << " will receive element-element connectivity entries for the following elements: "< elems_in_elem_on_rank(total_elem_elem_entries); + + // Now scatter the num_elems_in_elem for each element on each rank + std::vector num_elems_in_elem_per_rank(num_elements_on_rank); + + if (rank == 0) { + std::vector all_num_elems_in_elem; + std::vector displs_ee(world_size); + int displacement = 0; + + for(int i = 0; i < world_size; i++) { + displs_ee[i] = displacement; + for(int k = 0; k < elements_to_send[i].size(); k++) { + all_num_elems_in_elem.push_back(initial_mesh.num_elems_in_elem(elements_to_send[i][k])); + } + displacement += elements_to_send[i].size(); + } + + MPI_Scatterv(all_num_elems_in_elem.data(), elems_per_rank.data(), displs_ee.data(), MPI_INT, + num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished scattering the actual element-element connectivity counts per element to each rank"< all_elems_in_elem; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + + for(int i = 0; i < world_size; i++) { + sendcounts[i] = elem_elem_counts[i]; + displs[i] = displacement; + + // Copy element-element connectivity for rank i + for(int k = 0; k < elements_to_send[i].size(); k++) { + for(int l = 0; l < initial_mesh.num_elems_in_elem(elements_to_send[i][k]); l++) { + 
all_elems_in_elem.push_back(initial_mesh.elems_in_elem(elements_to_send[i][k], l)); + } + } + displacement += elem_elem_counts[i]; + } + + // Send the element-element connectivity data to each rank using MPI_Scatterv + MPI_Scatterv(all_elems_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished receiving the actual element-element connectivity entries to each rank"<(num_nodes_on_rank, "naive_mesh.local_to_global_node_mapping"); + naive_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "naive_mesh.local_to_global_elem_mapping"); + + for(int i = 0; i < num_nodes_on_rank; i++) { + naive_mesh.local_to_global_node_mapping.host(i) = nodes_on_rank[i]; + } + + for(int i = 0; i < num_elements_on_rank; i++) { + naive_mesh.local_to_global_elem_mapping.host(i) = elements_on_rank[i]; + } + + naive_mesh.local_to_global_node_mapping.update_device(); + naive_mesh.local_to_global_elem_mapping.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< vertloctab: + * CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i] + * gives the index in edgeloctab where the neighbor list of vertex i begins. + * PT-Scotch expects this array to be of size vertlocnbr+1, where the difference + * vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i. + * + * - std::vector edgeloctab: + * CSR array [variable size]: a flattened list of *neighboring element global IDs*, + * in no particular order. 
For vertex i, its neighbors are located at + * edgeloctab[vertloctab[i]...vertloctab[i+1]-1]. + * In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to + * recognize edges both within and across ranks. + * + * - std::map elem_gid_to_offset: + * Helper map: For a given element global ID, gives the starting offset in + * the flattened neighbor array (elems_in_elem_on_rank) where this element's + * list of neighbor GIDs begins. This allows efficient neighbor list lookup. + * + * - (other arrays used, from mesh setup and communication phase) + * - elements_on_rank: vector of global element IDs owned by this rank. + * - num_elements_on_rank: number of owned elements. + * - num_elems_in_elem_per_rank: array, for each owned element, how many + * neighbors it has. + * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. + * + **********************************************************************************/ + + // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- + SCOTCH_Dgraph dgraph; + if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + // Set base value for numbering (0 for C-style arrays) + const SCOTCH_Num baseval = 0; + + // vertlocnbr: Number of elements (vertices) that are local to this MPI rank + const SCOTCH_Num vertlocnbr = static_cast(naive_mesh.num_elems); + + // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) + const SCOTCH_Num vertlocmax = vertlocnbr; + + // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- + // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins + std::vector vertloctab(vertlocnbr + 1); + + // edgeloctab: flat array of neighbor global IDs for all local elements, built in order + std::vector edgeloctab; + 
edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance + + // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) + // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. + std::map elem_gid_to_offset; + size_t current_offset = 0; + for (size_t k = 0; k < num_elements_on_rank; k++) { + elem_gid_to_offset[elements_on_rank[k]] = current_offset; + current_offset += num_elems_in_elem_per_rank[k]; + } + + // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- + SCOTCH_Num offset = 0; // running count of edges encountered + + for (size_t lid = 0; lid < naive_mesh.num_elems; ++lid) { + + // Record current edge offset for vertex lid in vertloctab + vertloctab[lid] = offset; + + // Obtain this local element's global ID (from mapping) + int elem_gid = naive_mesh.local_to_global_elem_mapping.host(lid); + + // Find offset in the flattened neighbor array for this element's neighbor list + size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; + + // For this element, find the count of its neighbors + // This requires finding its index in the elements_on_rank array + size_t idx = 0; + for (size_t k = 0; k < num_elements_on_rank; k++) { + if (elements_on_rank[k] == elem_gid) { + idx = k; + break; + } + } + size_t num_nbrs = num_elems_in_elem_per_rank[idx]; + + // Append each neighbor (by its GLOBAL elem GID) to edgeloctab + for (size_t j = 0; j < num_nbrs; j++) { + size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! 
+ edgeloctab.push_back(static_cast(neighbor_gid)); + ++offset; // Increment running edge count + } + } + + // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure + vertloctab[vertlocnbr] = offset; + + // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally + // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) + const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) + const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size matches number of endpoints + + // Optionally print graph structure for debugging/validation + if (print_info) { + std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr + << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; + std::cout << "vertloctab (CSR row offsets): "; + for (size_t i = 0; i <= vertlocnbr; i++) { + std::cout << vertloctab[i] << " "; + } + std::cout << std::endl; + std::cout << "edgeloctab (first 20 neighbor GIDs): "; + for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { + std::cout << edgeloctab[i] << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + /************************************************************************** + * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild + * + * - PT-Scotch will use our CSR arrays. Since we use compact representation, + * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") + * can be passed as nullptr. + * - edgeloctab contains *GLOBAL element GIDs* of neighbors. PT-Scotch uses this + * to discover connections across processor boundaries, so you do not have to + * encode ownership or partition information yourself. 
+ **************************************************************************/ + int rc = SCOTCH_dgraphBuild( + &dgraph, + baseval, // start index (0) + vertlocnbr, // local vertex count (local elements) + vertlocmax, // local vertex max (no holes) + vertloctab.data(), // row offsets in edgeloctab + /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) + /*veloloctab*/ nullptr, // vertex weights, not used + /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) + edgelocnbr, // local edge endpoints count + edgelocsiz, // size of edge array + edgeloctab.data(), // global neighbor IDs for each local node + /*edgegsttab*/ nullptr, // ghost edge array, not used + /*edloloctab*/ nullptr // edge weights, not used + ); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Optionally, print rank summary after graph build for further validation + if (print_info) { + SCOTCH_Num vertlocnbr_out; + SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); + std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished building the distributed graph using PT-Scotch"<(world_size)); + + + + + // ===================== PT-Scotch Strategy Selection and Documentation ====================== + // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. + // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. + // + // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): + // + // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. + // Useful for quick, generic partitions where quality is not critical. 
+ // + // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). + // For large runs or test runs where speed is more important than minimizing edgecut. + // + // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). + // Slower than the default. Use when high-quality partitioning is desired. + // + // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. + // Use if load balance is more critical than cut size. + // + // Additional Options: + // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). + // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). + // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. + // + // Example usage: + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); + // ^ quality-focused, nparts=number of parts/ranks + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); + // ^ speed-focused, allow 5% imbalance + // + // Reference: + // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf + // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. + // + // --------------- Set up the desired partitioning strategy here: --------------- + SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings + SCOTCH_stratInit(&stratdat); + + // Select partitioning strategy for this run: + // Use SCOTCH_STRATQUALITY for best cut quality. + // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. 
+ // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) + SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.01); + + // partloctab: output array mapping each local element (vertex) to a *target partition number* + // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. + std::vector partloctab(vertlocnbr); + rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Clean up PT-Scotch strategy and architecture objects + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + + // Free the graph now that we have the partition assignments + SCOTCH_dgraphFree(&dgraph); + + /*************************************************************************** + * Step 7 (Optional): Print out the partitioning assignment per element + * - Each local element's local index lid and global ID (gid) are listed with the + * part to which PT-Scotch has assigned them. 
+ ***************************************************************************/ + print_info = false; + for(int rank_id = 0; rank_id < world_size; rank_id++) { + if(rank_id == rank && print_info) { + for (size_t lid = 0; lid < naive_mesh.num_elems; ++lid) { + size_t gid = naive_mesh.local_to_global_elem_mapping.host(lid); + std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid + << " -> part=" << partloctab[lid] << "\n"; + } + MPI_Barrier(MPI_COMM_WORLD); + } + MPI_Barrier(MPI_COMM_WORLD); + } + print_info = false; + + + + +// ****************************************************************************************** +// Build the final mesh from the repartition +// ****************************************************************************************** + + + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "\n=== Starting Mesh Redistribution Phase ===\n"; + MPI_Barrier(MPI_COMM_WORLD); + + // -------------- Phase 1: Determine elements to send to each rank -------------- + std::vector> elems_to_send(world_size); + for (int lid = 0; lid < naive_mesh.num_elems; ++lid) { + int dest = static_cast(partloctab[lid]); + int elem_gid = static_cast(naive_mesh.local_to_global_elem_mapping.host(lid)); + elems_to_send[dest].push_back(elem_gid); + } + + // -------------- Phase 2: Exchange element GIDs -------------- + std::vector sendcounts(world_size), recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + sendcounts[r] = static_cast(elems_to_send[r].size()); + + MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements + std::vector sdispls(world_size), rdispls(world_size); + int send_total = 0, recv_total = 0; + for (int r = 0; r < world_size; ++r) { + sdispls[r] = send_total; + rdispls[r] = recv_total; + send_total += sendcounts[r]; + recv_total += recvcounts[r]; + } + + + // Flatten send buffer + std::vector sendbuf; + 
sendbuf.reserve(send_total); + for (int r = 0; r < world_size; ++r) + sendbuf.insert(sendbuf.end(), elems_to_send[r].begin(), elems_to_send[r].end()); + + // Receive new local element GIDs + std::vector recvbuf(recv_total); + MPI_Alltoallv(sendbuf.data(), sendcounts.data(), sdispls.data(), MPI_INT, + recvbuf.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element GIDs"< new_elem_gids = recvbuf; + int num_new_elems = static_cast(new_elem_gids.size()); + + + if (print_info) { + std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; + } + + // -------------- Phase 3: Send element–node connectivity -------------- + int nodes_per_elem = naive_mesh.num_nodes_in_elem; + + // Flatten element-node connectivity by global node IDs + std::vector conn_sendbuf; + for (int r = 0; r < world_size; ++r) { + for (int gid : elems_to_send[r]) { + // find local element lid from gid + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; ++i) + if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + conn_sendbuf.push_back(node_gid); + } + } + } + + // element-node connectivity counts (ints per dest rank) + std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; + + MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< conn_sdispls(world_size), conn_rdispls(world_size); + int conn_send_total = 0, conn_recv_total = 0; + for (int r = 0; r < world_size; ++r) { + conn_sdispls[r] = 
conn_send_total; + conn_rdispls[r] = conn_recv_total; + conn_send_total += conn_sendcounts[r]; + conn_recv_total += conn_recvcounts[r]; + } + + std::vector conn_recvbuf(conn_recv_total); + MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, + conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); + std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); + int num_new_nodes = static_cast(new_node_gids.size()); + + // Build map gid→lid + std::unordered_map node_gid_to_lid; + for (int i = 0; i < num_new_nodes; ++i) + node_gid_to_lid[new_node_gids[i]] = i; + + if (print_info) + std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; + + + // -------------- Phase 5: Request node coordinates -------------- + std::vector node_coords_sendbuf; + for (int r = 0; r < world_size; ++r) { + for (int gid : elems_to_send[r]) { + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; ++i) + if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 0)); + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 1)); + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 2)); + } + } + } + + // Each node is 3 doubles; same sendcounts scaling applies + std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; + + MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + 
MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates counts"< coord_sdispls(world_size), coord_rdispls(world_size); + int coord_send_total = 0, coord_recv_total = 0; + for (int r = 0; r < world_size; ++r) { + coord_sdispls[r] = coord_send_total; + coord_rdispls[r] = coord_recv_total; + coord_send_total += coord_sendcounts[r]; + coord_recv_total += coord_recvcounts[r]; + } + + std::vector coord_recvbuf(coord_recv_total); + MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, + coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates"<(num_new_nodes); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + + // Fill global mappings + for (int i = 0; i < num_new_nodes; ++i) + intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; + for (int i = 0; i < num_new_elems; ++i) + intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; + + intermediate_mesh.local_to_global_node_mapping.update_device(); + intermediate_mesh.local_to_global_elem_mapping.update_device(); + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"<> node_gid_to_coords; + int coord_idx = 0; + for (int e = 0; e < num_new_elems; ++e) { + for (int j = 0; j < nodes_per_elem; j++) { + int node_gid = conn_recvbuf[e * nodes_per_elem + j]; + if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { + node_gid_to_coords[node_gid] = { + coord_recvbuf[coord_idx*3 + 0], + coord_recvbuf[coord_idx*3 + 1], + coord_recvbuf[coord_idx*3 + 2] + }; + } + coord_idx++; + } + } + + // Now fill coordinates in node order + intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); + for (int i = 0; i < 
num_new_nodes; ++i) { + int node_gid = new_node_gids[i]; + auto it = node_gid_to_coords.find(node_gid); + if (it != node_gid_to_coords.end()) { + intermediate_node.coords.host(i, 0) = it->second[0]; + intermediate_node.coords.host(i, 1) = it->second[1]; + intermediate_node.coords.host(i, 2) = it->second[2]; + } + } + intermediate_node.coords.update_device(); + + // Connectivity rebuild + intermediate_mesh.build_connectivity(); + MPI_Barrier(MPI_COMM_WORLD); + + + +// ****************************************************************************************** +// Build the ghost elements and nodes +// ****************************************************************************************** + + double t_ghost_start = MPI_Wtime(); + + // First, gather the number of elements each rank owns + std::vector elem_counts(world_size); + + // int MPI_Allgather( + // const void* sendbuf, // Data to send from this process + // int sendcount, // Number of elements to send + // MPI_Datatype sendtype, // Type of send data + // void* recvbuf, // Buffer to receive all data + // int recvcount, // Number of elements to receive from each process + // MPI_Datatype recvtype, // Type of receive data + // MPI_Comm comm // Communicator + // ); + MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // Compute displacements + std::vector elem_displs(world_size); + int total_elems = 0; + for (int r = 0; r < world_size; ++r) { + elem_displs[r] = total_elems; + total_elems += elem_counts[r]; + } + + // Gather all element GIDs from all ranks + std::vector all_elem_gids(total_elems); + + // int MPI_Allgatherv( + // const void* sendbuf, // Data to send from this process + // int sendcount, // Number of elements THIS process sends + // MPI_Datatype sendtype, // Type of send data + // void* recvbuf, // Buffer to receive all data + // const int* recvcounts, // Array: number of elements from each process + // const int* 
displs, // Array: displacement for each process's data + // MPI_Datatype recvtype, // Type of receive data + // MPI_Comm comm // Communicator + // ); + MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, + all_elem_gids.data(), elem_counts.data(), elem_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // Build a map: element GID -> owning rank + std::map elem_gid_to_rank; + for (int r = 0; r < world_size; ++r) { + for (int i = 0; i < elem_counts[r]; ++i) { + size_t gid = all_elem_gids[elem_displs[r] + i]; + elem_gid_to_rank[gid] = r; + } + } + + // Strategy: Find ghost elements by checking neighbors of our boundary elements. + // A boundary element is one that has a neighbor owned by another rank. + // However, since build_connectivity() only includes locally-owned elements, + // we need to use a different approach: find elements on other ranks that share + // nodes with our locally-owned elements. 
+ + // First, collect all nodes that belong to our locally-owned elements + std::set local_elem_nodes; + for (int lid = 0; lid < num_new_elems; ++lid) { + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + local_elem_nodes.insert(node_gid); + } + } + + // Now collect element-to-node connectivity to send to all ranks + // Format: for each element, list its node GIDs (each entry is a pair: elem_gid, node_gid) + std::vector elem_node_conn; + int local_conn_size = 0; + + for (int lid = 0; lid < num_new_elems; ++lid) { + size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + elem_node_conn.push_back(elem_gid); + elem_node_conn.push_back(node_gid); + } + local_conn_size += nodes_per_elem * 2; // Each pair is 2 size_ts + } + + // Exchange element-node connectivity with all ranks using Allgather + // First, gather the sizes from each rank + std::vector conn_sizes(world_size); + MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // Compute displacements + std::vector conn_displs(world_size); + int total_conn = 0; + for (int r = 0; r < world_size; ++r) { + conn_displs[r] = total_conn; + total_conn += conn_sizes[r]; + } + + // Gather all element-node pairs from all ranks + std::vector all_conn(total_conn); + MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, + all_conn.data(), conn_sizes.data(), conn_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // create a set for local_elem_gids + std::set local_elem_gids; + for (int i = 0; i < num_new_elems; ++i) { + 
local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); + } + + // Build a map: node GID -> set of element GIDs that contain it (from other ranks) + std::map> node_to_ext_elem; + for (int r = 0; r < world_size; ++r) { + if (r == rank) continue; // Skip our own data + // Process pairs from rank r: conn_sizes[r] is in units of size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + for (int i = 0; i < num_pairs; ++i) { + // Each pair is 2 size_ts, starting at conn_displs[r] + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this node is in one of our elements, then the element is a potential ghost + if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { + // Check if this element is not owned by us + if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { + node_to_ext_elem[node_gid].insert(elem_gid); + } + } + } + } + + // Collect all unique ghost element GIDs + std::set ghost_elem_gids; + for (const auto& pair : node_to_ext_elem) { + for (size_t elem_gid : pair.second) { + ghost_elem_gids.insert(elem_gid); + } + } + + // Additional check: elements that are neighbors of our locally-owned elements + // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) + + for (int lid = 0; lid < num_new_elems; ++lid) { + size_t num_neighbors = intermediate_mesh.num_elems_in_elem(lid); + + for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { + size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); + + if (neighbor_lid < static_cast(num_new_elems)) { + size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); + + // Check if neighbor is owned by this rank + auto it = elem_gid_to_rank.find(neighbor_gid); + if (it != elem_gid_to_rank.end() && it->second != rank) { + // Neighbor is owned by another rank - it's a ghost for us + 
ghost_elem_gids.insert(neighbor_gid); + } + } + } + } + + // Count unique ghost elements + intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); + + MPI_Barrier(MPI_COMM_WORLD); + double t_ghost_end = MPI_Wtime(); + + if (rank == 0) { + std::cout << " Finished calculating ghost elements" << std::endl; + std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." << std::endl; + } + + // Print ghost element info if requested + print_info = false; + for(int i = 0; i < world_size; i++) { + MPI_Barrier(MPI_COMM_WORLD); + if(rank == i && print_info) { + std::cout << "[rank " << rank << "] owns " << num_new_elems + << " elements and has " << intermediate_mesh.num_ghost_elems << " ghost elements" << std::endl; + std::cout << "[rank " << rank << "] owned element global IDs: "; + for (int j = 0; j < intermediate_mesh.num_elems; j++) { + std::cout << intermediate_mesh.local_to_global_elem_mapping(j) << " "; + } + + // Print global IDs of ghost elements + std::cout << std::endl << "[rank " << rank << "] ghost element global IDs: "; + for (const auto& gid : ghost_elem_gids) { + std::cout << gid << " "; + } + std::cout << std::endl; + } + + MPI_Barrier(MPI_COMM_WORLD); + } + + + + // Build the connectivity that includes ghost elements + // Create an extended mesh with owned elements first, then ghost elements appended + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; + + // Step 1: Extract ghost element-node connectivity from all_conn + // Build a map: ghost_elem_gid -> vector of node_gids (ordered as in all_conn) + std::map> ghost_elem_to_nodes; + for (const size_t& ghost_gid : ghost_elem_gids) { + ghost_elem_to_nodes[ghost_gid].reserve(nodes_per_elem); + } + + // Extract nodes for each ghost element from all_conn + // The all_conn array has pairs (elem_gid, node_gid) for each rank's elements + for (int r = 0; r < world_size; ++r) { + if (r == rank) 
continue; // Skip our own data (we already have owned element connectivity) + int num_pairs = conn_sizes[r] / 2; + + // Process pairs in order - each element's nodes are contiguous + for (int i = 0; i < num_pairs; ++i) { + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this is one of our ghost elements, record its node (in order) + auto it = ghost_elem_to_nodes.find(elem_gid); + if (it != ghost_elem_to_nodes.end()) { + it->second.push_back(node_gid); + } + } + } + + // Verify each ghost element has the correct number of nodes + for (auto& pair : ghost_elem_to_nodes) { + if (pair.second.size() != static_cast(nodes_per_elem)) { + std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first + << " has " << pair.second.size() << " nodes, expected " << nodes_per_elem << std::endl; + } + } + + // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) + // Start with owned nodes + std::map node_gid_to_extended_lid; + int extended_node_lid = 0; + + // Add all owned nodes + for (int i = 0; i < intermediate_mesh.num_nodes; ++i) { + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(i); + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) + std::set ghost_only_nodes; + for (const auto& pair : ghost_elem_to_nodes) { + for (size_t node_gid : pair.second) { + // Check if we already have this node + if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { + ghost_only_nodes.insert(node_gid); + } + } + } + + // Assign extended local IDs to ghost-only nodes + for (size_t node_gid : ghost_only_nodes) { + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + int total_extended_nodes = extended_node_lid; + + // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) + // Build request 
list: for each ghost node, find an owning rank via any ghost element that contains it + std::map> rank_to_ghost_node_requests; + for (size_t node_gid : ghost_only_nodes) { + // Find which rank owns an element containing this node + // Look through ghost elements + for (const auto& pair : ghost_elem_to_nodes) { + size_t ghost_elem_gid = pair.first; + const std::vector& nodes = pair.second; + bool found = false; + for (size_t ngid : nodes) { + if (ngid == node_gid) { + found = true; + break; + } + } + if (found) { + auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); + if (owner_it != elem_gid_to_rank.end()) { + rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); + break; + } + } + } + } + + // Step 4: Build extended element list and node connectivity + // Owned elements: 0 to num_new_elems-1 (already have these) + // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 + + // Create extended element-node connectivity array + int total_extended_elems = intermediate_mesh.num_elems + intermediate_mesh.num_ghost_elems; + std::vector> extended_nodes_in_elem(total_extended_elems); + + // Copy owned element connectivity (convert to extended node LIDs) + for (int lid = 0; lid < intermediate_mesh.num_elems; ++lid) { + extended_nodes_in_elem[lid].reserve(nodes_per_elem); + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[lid].push_back(ext_lid); + } + } + + // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) + int ghost_elem_ext_lid = intermediate_mesh.num_elems; + std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); + std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); + + for (size_t ghost_gid : ghost_elem_gids_ordered) { + auto it = 
ghost_elem_to_nodes.find(ghost_gid); + if (it == ghost_elem_to_nodes.end()) continue; + + extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); + for (size_t node_gid : it->second) { + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); + } + ghost_elem_ext_lid++; + } + + MPI_Barrier(MPI_COMM_WORLD); + // Sequential rank-wise printing of extended mesh structure info + for (int r = 0; r < world_size; ++r) { + if (rank == r) { + std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << intermediate_mesh.num_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << intermediate_mesh.num_nodes << std::endl; + std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; + std::cout << std::flush; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements + // Each element's nodes are stored using extended local node IDs (0-based, contiguous) + + // Build reverse maps: extended_lid -> gid for nodes and elements + std::vector extended_lid_to_node_gid(total_extended_nodes); + for (const auto& pair : node_gid_to_extended_lid) { + extended_lid_to_node_gid[pair.second] = pair.first; + } + + // Build extended element GID list: owned first, then ghost + std::vector extended_lid_to_elem_gid(total_extended_elems); + // Owned elements + for (int i = 0; i < intermediate_mesh.num_elems; ++i) { + extended_lid_to_elem_gid[i] = 
intermediate_mesh.local_to_global_elem_mapping.host(i); + } + // Ghost elements (in sorted order) + for (size_t idx = 0; idx < ghost_elem_gids_ordered.size(); ++idx) { + extended_lid_to_elem_gid[intermediate_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; + } + + + +// ****************************************************************************************** +// Build the final partitioned mesh +// ****************************************************************************************** + + + + + final_mesh.initialize_nodes(total_extended_nodes); + final_mesh.initialize_elems(total_extended_elems, 3); + final_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); + final_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); + for (int i = 0; i < total_extended_nodes; i++) { + final_mesh.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; + } + for (int i = 0; i < total_extended_elems; i++) { + final_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; + } + final_mesh.local_to_global_node_mapping.update_device(); + final_mesh.local_to_global_elem_mapping.update_device(); + + final_mesh.num_ghost_elems = ghost_elem_gids.size(); + final_mesh.num_ghost_nodes = ghost_only_nodes.size(); + + // Set owned counts for write_vtk (excludes ghost elements/nodes) + final_mesh.num_owned_elems = intermediate_mesh.num_elems; + final_mesh.num_owned_nodes = intermediate_mesh.num_nodes; + + + // Print num ghost elements and nodes on each rank sequentially + for (int r = 0; r < world_size; ++r) { + if (rank == r) { + std::cout << "*******[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; + std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_ghost_nodes << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global 
node ids to the local node ids"< coordinate. + // 3. Use this map to fill final_node.coords. + + // 1. Build list of all global node IDs needed on this rank (owned + ghosts) + std::vector all_needed_node_gids(total_extended_nodes); + for (int i = 0; i < total_extended_nodes; ++i) { + all_needed_node_gids[i] = final_mesh.local_to_global_node_mapping.host(i); + } + + // 2. Build owned node GIDs and their coordinates + std::vector owned_gids(intermediate_mesh.num_nodes); + for (int i = 0; i < owned_gids.size(); ++i) + owned_gids[i] = intermediate_mesh.local_to_global_node_mapping.host(i); + + // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) + // so we can distribute the needed coordinate data. + // The easiest is to Allgather everyone's "owned_gids" and coords + + int local_owned_count = static_cast(owned_gids.size()); + std::vector owned_counts(world_size, 0); + if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 + + // a) Gather counts + owned_counts.resize(world_size, 0); + MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // b) Displacements and total + std::vector owned_displs(world_size,0); + int total_owned = 0; + for (int r=0; r all_owned_gids(total_owned); + MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, + all_owned_gids.data(), owned_counts.data(), owned_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // d) Global coords (size: total_owned x 3) + std::vector owned_coords_send(3*local_owned_count, 0.0); + for (int i=0; i all_owned_coords(3 * total_owned, 0.0); + + // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + std::vector coord_counts(world_size); + std::vector coord_displs(world_size); + for (int r=0; r coord[3] + std::unordered_map> gid_to_coord; + for (int i=0; i xyz = { + all_owned_coords[3*i+0], + all_owned_coords[3*i+1], + all_owned_coords[3*i+2] + 
}; + gid_to_coord[all_owned_gids[i]] = xyz; + } + + // 4. Finally, fill final_node.coords with correct coordinates. + for (int i = 0; i < total_extended_nodes; ++i) { + size_t gid = final_mesh.local_to_global_node_mapping.host(i); + auto it = gid_to_coord.find(gid); + if (it != gid_to_coord.end()) { + final_node.coords.host(i,0) = it->second[0]; + final_node.coords.host(i,1) = it->second[1]; + final_node.coords.host(i,2) = it->second[2]; + } else { + // Could happen if there's a bug: fill with zeros for safety + final_node.coords.host(i,0) = 0.0; + final_node.coords.host(i,1) = 0.0; + final_node.coords.host(i,2) = 0.0; + } + } + final_node.coords.update_device(); + + + // -------------------------------------------------------------------------------------- + // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost element GIDs. + // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); + + // Prepare local ghost list as vector + std::vector ghost_gids_vec; + ghost_gids_vec.reserve(ghost_elem_gids.size()); + for (const auto &g : ghost_elem_gids) ghost_gids_vec.push_back(g); + + // Exchange counts + std::vector ghost_counts(world_size, 0); + int local_ghost_count = static_cast(ghost_gids_vec.size()); + MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Displacements and recv buffer + std::vector ghost_displs(world_size, 0); + int total_ghosts = 0; + for (int r = 0; r < world_size; ++r) { + ghost_displs[r] = total_ghosts; + total_ghosts += ghost_counts[r]; + } + std::vector all_ghost_gids(total_ghosts); + + // Gather ghost gids + MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, + all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished gathering ghost element GIDs" << std::endl; + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build the reverse map for communication" << std::endl; + // Build map gid -> ranks that ghost it + std::unordered_map> gid_to_ghosting_ranks; + gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); + for (int r = 0; r < world_size; ++r) { + int cnt = ghost_counts[r]; + int off = ghost_displs[r]; + for (int i = 0; i < cnt; ++i) { + size_t g = all_ghost_gids[off + i]; + gid_to_ghosting_ranks[g].push_back(r); + } + } + + // For each local element, list destinations: ranks that ghost our gid + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + auto it = gid_to_ghosting_ranks.find(local_elem_gid); + if (it == 
gid_to_ghosting_ranks.end()) continue; + const std::vector &dest_ranks = it->second; + for (int rr : dest_ranks) { + if (rr == rank) continue; + boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); + } + } + + std::cout.flush(); + MPI_Barrier(MPI_COMM_WORLD); + // Optional: print a compact summary of reverse map for verification (limited output) + for(int i = 0; i < world_size; i++) { + if (rank == i && print_info) { + std::cout << std::endl; + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + + size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + if (boundary_elem_targets[elem_lid].empty()) + { + std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost elements" << std::endl; + } + else + { + std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: "; + int shown = 0; + for (const auto &pr : boundary_elem_targets[elem_lid]) { + if (shown >= 12) { std::cout << " ..."; break; } + std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; + shown++; + } + std::cout << std::endl; + } + } + std::cout.flush(); + } + MPI_Barrier(MPI_COMM_WORLD); + } + + print_info = false; + + + MPI_Barrier(MPI_COMM_WORLD); + + + + + +} + + + + + + + + + +#endif \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 4cc6da9e..bc3e8371 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -1,59 +1,25 @@ -#include -#include -#include -#include -#include -#include -#include -#include +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// #include -#include "mesh.h" -#include "state.h" -#include "mesh_io.h" +// #include "mesh.h" +// #include "state.h" +// #include "mesh_io.h" + +#include "decomp_utils.h" // Include Scotch 
headers #include "scotch.h" #include "ptscotch.h" - -void calc_elements_per_rank(std::vector& elems_per_rank, int num_elems, int world_size){ - // Compute elements to send to each rank; handle remainders for non-even distribution - std::fill(elems_per_rank.begin(), elems_per_rank.end(), num_elems / world_size); - int remainder = num_elems % world_size; - for (int i = 0; i < remainder; ++i) { - elems_per_rank[i] += 1; - } -} - -void print_rank_mesh_info(Mesh_t& mesh, int rank) { - - std::cout< elements_on_rank; - std::vector nodes_on_rank; - - - std::vector elems_per_rank(world_size); // number of elements to send to each rank size(world_size) - std::vector nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) - - // create a 2D vector of elements to send to each rank - std::vector> elements_to_send(world_size); - - // create a 2D vector of nodes to send to each rank - std::vector> nodes_to_send(world_size); - - // Create a 2D vector to hold the nodal positions on each rank - std::vector> node_pos_to_send(world_size); - - // create a 2D vector to hold the node positions on each rank - std::vector> node_pos_on_rank(world_size); - - // ******************************************************** // Build the initial mesh // ******************************************************** @@ -127,1889 +54,17 @@ int main(int argc, char** argv) { std::cout<<"Rank "< all_elements; - std::vector sendcounts(world_size); - std::vector displs(world_size); - - int displacement = 0; - for (int i = 0; i < world_size; i++) { - sendcounts[i] = elems_per_rank[i]; - displs[i] = displacement; - // Copy elements for rank i to the flattened array - for (int j = 0; j < elems_per_rank[i]; j++) { - all_elements.push_back(elements_to_send[i][j]); - } - displacement += elems_per_rank[i]; - } - - // Send the elements to each rank - MPI_Scatterv(all_elements.data(), sendcounts.data(), displs.data(), MPI_INT, - elements_on_rank.data(), num_elements_on_rank, MPI_INT, - 0, 
MPI_COMM_WORLD); - } - else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - elements_on_rank.data(), num_elements_on_rank, MPI_INT, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - double t_scatter_gids_end = MPI_Wtime(); - if(rank == 0) { - std::cout<<" Finished scattering the actual element global ids to each rank"< nodes_set; - for (int j = 0; j < elems_per_rank[i]; j++) { - for (int k = 0; k < num_nodes_per_elem; k++) { - nodes_set.insert(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); - } - } - nodes_to_send[i] = std::vector(nodes_set.begin(), nodes_set.end()); - } - - for (int i = 0; i < world_size; i++) { - nodes_per_rank[i] = nodes_to_send[i].size(); - } - - if (print_info) { - - - std::cout< all_nodes; - std::vector sendcounts(world_size); - std::vector displs(world_size); - - int displacement = 0; - for (int i = 0; i < world_size; i++) { - sendcounts[i] = nodes_to_send[i].size(); - displs[i] = displacement; - // Copy nodes for rank i to the flattened array - for (int j = 0; j < nodes_to_send[i].size(); j++) { - all_nodes.push_back(nodes_to_send[i][j]); - } - displacement += nodes_to_send[i].size(); - } - // Send the nodes to each rank - // all_nodes.data(): Pointer to the flattened array of all nodes to be sent to each rank - // sendcounts.data(): Array with the number of nodes to send to each rank - // displs.data(): Array with the displacement for each rank in the flattened array - // MPI_INT: Data type of the nodes (integer) - // nodes_on_rank.data(): Pointer to the buffer where each rank will receive its nodes - // num_nodes_on_rank: Number of nodes that the receiving rank expects to receive - // MPI_INT: Data type of the receive buffer (integer) - // 0: The root rank (rank 0) that is performing the scatter - // MPI_COMM_WORLD: The communicator - MPI_Scatterv(all_nodes.data(), sendcounts.data(), displs.data(), MPI_INT, - nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, - 0, MPI_COMM_WORLD); - } - else { - 
MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - - // Timer: End measuring time for scattering node global ids - double t_scatter_nodeids_end = MPI_Wtime(); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) { - std::cout<<" Finished scattering the actual node global ids to each rank"< node_pos_on_rank_flat(num_nodes_on_rank * 3); - - // Timer for scattering node positions - double t_scatter_nodepos_start = MPI_Wtime(); - - if(rank == 0) - { - for (int i = 0; i < world_size; i++) { - for(int node_gid = 0; node_gid < nodes_to_send[i].size(); node_gid++) - { - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 0)); - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 1)); - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 2)); - } - } - - // Prepare data for MPI_Scatterv (scatter with variable counts) - // Flatten the 2D node_pos_to_send into a 1D array - std::vector all_node_pos; - std::vector sendcounts(world_size); - std::vector displs(world_size); - - int displacement = 0; - for (int i = 0; i < world_size; i++) { - sendcounts[i] = nodes_to_send[i].size() * 3; - displs[i] = displacement; // displacement is the starting index of the nodes for the current rank in the flattened array - // Copy node positions for rank i to the flattened array - for(int j = 0; j < nodes_to_send[i].size(); j++) { - for(int k = 0; k < 3; k++) { - all_node_pos.push_back(node_pos_to_send[i][j * 3 + k]); - } - } - displacement += nodes_to_send[i].size() * 3; - } - - // Send the node positions to each rank - MPI_Scatterv(all_node_pos.data(), sendcounts.data(), displs.data(), MPI_DOUBLE, - node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, - 0, MPI_COMM_WORLD); - } - else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_DOUBLE, - node_pos_on_rank_flat.data(), 
num_nodes_on_rank * 3, MPI_DOUBLE, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - - if (rank == 0 && print_info) { - // Print out the node positions on this rank - std::cout << "Rank " << rank << " received node positions: "; - for (int i = 0; i < num_nodes_on_rank; i++) { - std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " - << node_pos_on_rank_flat[i*3+1] << ", " - << node_pos_on_rank_flat[i*3+2] << ") "; - } - std::cout << std::endl; - } - - - MPI_Barrier(MPI_COMM_WORLD); - - if (rank == 1 && print_info) { - // Print out the node positions on this rank - std::cout << "Rank " << rank << " received node positions: "; - for (int i = 0; i < num_nodes_on_rank; i++) { - std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " - << node_pos_on_rank_flat[i*3+1] << ", " - << node_pos_on_rank_flat[i*3+2] << ") "; - } - std::cout << std::endl; - } - - MPI_Barrier(MPI_COMM_WORLD); - - double t_scatter_nodepos_end = MPI_Wtime(); - if(rank == 0) { - std::cout<<" Finished scattering the node positions to each rank"< required_node_state = { node_state::coords }; - node.initialize(num_nodes_on_rank, 3, required_node_state); - - for(int i = 0; i < num_nodes_on_rank; i++) { - node.coords.host(i, 0) = node_pos_on_rank_flat[i*3]; - node.coords.host(i, 1) = node_pos_on_rank_flat[i*3+1]; - node.coords.host(i, 2) = node_pos_on_rank_flat[i*3+2]; - } - - node.coords.update_device(); - -// ****************************************************************************************** -// Send the element-node connectivity data from the initial mesh to each rank -// ****************************************************************************************** - - // Send the element-node connectivity data from the initial mesh to each rank - std::vector nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); - - double t_scatter_elemnode_start = MPI_Wtime(); - - if (rank == 0) { - // Prepare element-node connectivity data for each rank - std::vector all_nodes_in_elem; - 
std::vector sendcounts(world_size); - std::vector displs(world_size); - - int displacement = 0; - for(int i = 0; i < world_size; i++) { - int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element - sendcounts[i] = num_connectivity_entries; - displs[i] = displacement; - - // Copy element-node connectivity for rank i - for(int j = 0; j < elements_to_send[i].size(); j++) { - for(int k = 0; k < num_nodes_per_elem; k++) { - all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); - } - } - displacement += num_connectivity_entries; - } - // Send the connectivity data to each rank - MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - 0, MPI_COMM_WORLD); - } - else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - - double t_scatter_elemnode_end = MPI_Wtime(); - if(rank == 0) { - std::cout << " Finished scattering the element-node connectivity data from the initial mesh to each rank" << std::endl; - std::cout << " Scattering element-node connectivity took " - << (t_scatter_elemnode_end - t_scatter_elemnode_start) << " seconds." 
<< std::endl; - } - - if (rank == 0 && print_info) { - - std::cout << "Rank " << rank << " received element-node connectivity (" - << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; - for (int elem = 0; elem < num_elements_on_rank; elem++) { - std::cout << " Element " << elem << " nodes: "; - for (int node = 0; node < num_nodes_per_elem; node++) { - int idx = elem * num_nodes_per_elem + node; - std::cout << nodes_in_elem_on_rank[idx] << " "; - } - std::cout << std::endl; - } - } - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished scattering the element-node connectivity data from the initial mesh to each rank"< elem_elem_counts(world_size); - int total_elem_elem_entries = 0; - - - double t_scatter_elem_elem_start = MPI_Wtime(); - - if (rank == 0){ - // Calculate total number of connectivity entries for each rank - for(int i = 0; i < world_size; i++) { - elem_elem_counts[i] = 0; - for(int k = 0; k < elements_to_send[i].size(); k++) { - elem_elem_counts[i] += initial_mesh.num_elems_in_elem(elements_to_send[i][k]); - } - - if(print_info) std::cout << "Rank " << i << " will receive " << elem_elem_counts[i] << " element-element connectivity entries" << std::endl; - } - - // Print element-element connectivity entries for each rank in the initial mesh - if(print_info) { - for(int i = 0; i < world_size; i++) { - std::cout << std::endl; - std::cout << "Rank " << i << " will receive element-element connectivity entries for the following elements: "< elems_in_elem_on_rank(total_elem_elem_entries); - - // Now scatter the num_elems_in_elem for each element on each rank - std::vector num_elems_in_elem_per_rank(num_elements_on_rank); - - if (rank == 0) { - std::vector all_num_elems_in_elem; - std::vector displs_ee(world_size); - int displacement = 0; - - for(int i = 0; i < world_size; i++) { - displs_ee[i] = displacement; - for(int k = 0; k < elements_to_send[i].size(); k++) { - 
all_num_elems_in_elem.push_back(initial_mesh.num_elems_in_elem(elements_to_send[i][k])); - } - displacement += elements_to_send[i].size(); - } - - MPI_Scatterv(all_num_elems_in_elem.data(), elems_per_rank.data(), displs_ee.data(), MPI_INT, - num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, - 0, MPI_COMM_WORLD); - } else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished scattering the actual element-element connectivity counts per element to each rank"< all_elems_in_elem; - std::vector sendcounts(world_size); - std::vector displs(world_size); - - int displacement = 0; - - for(int i = 0; i < world_size; i++) { - sendcounts[i] = elem_elem_counts[i]; - displs[i] = displacement; - - // Copy element-element connectivity for rank i - for(int k = 0; k < elements_to_send[i].size(); k++) { - for(int l = 0; l < initial_mesh.num_elems_in_elem(elements_to_send[i][k]); l++) { - all_elems_in_elem.push_back(initial_mesh.elems_in_elem(elements_to_send[i][k], l)); - } - } - displacement += elem_elem_counts[i]; - } - - // Send the element-element connectivity data to each rank using MPI_Scatterv - MPI_Scatterv(all_elems_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, - 0, MPI_COMM_WORLD); - } - else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished receiving the actual element-element connectivity entries to each rank"<(num_nodes_on_rank, "mesh.local_to_global_node_mapping"); - mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "mesh.local_to_global_elem_mapping"); - - for(int i = 0; i < num_nodes_on_rank; i++) 
{ - mesh.local_to_global_node_mapping.host(i) = nodes_on_rank[i]; - } - - for(int i = 0; i < num_elements_on_rank; i++) { - mesh.local_to_global_elem_mapping.host(i) = elements_on_rank[i]; - } - - mesh.local_to_global_node_mapping.update_device(); - mesh.local_to_global_elem_mapping.update_device(); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< vertloctab: - * CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i] - * gives the index in edgeloctab where the neighbor list of vertex i begins. - * PT-Scotch expects this array to be of size vertlocnbr+1, where the difference - * vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i. - * - * - std::vector edgeloctab: - * CSR array [variable size]: a flattened list of *neighboring element global IDs*, - * in no particular order. For vertex i, its neighbors are located at - * edgeloctab[vertloctab[i]...vertloctab[i+1]-1]. - * In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to - * recognize edges both within and across ranks. - * - * - std::map elem_gid_to_offset: - * Helper map: For a given element global ID, gives the starting offset in - * the flattened neighbor array (elems_in_elem_on_rank) where this element's - * list of neighbor GIDs begins. This allows efficient neighbor list lookup. - * - * - (other arrays used, from mesh setup and communication phase) - * - elements_on_rank: vector of global element IDs owned by this rank. - * - num_elements_on_rank: number of owned elements. - * - num_elems_in_elem_per_rank: array, for each owned element, how many - * neighbors it has. - * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. 
- * - **********************************************************************************/ - - // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- - SCOTCH_Dgraph dgraph; - if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - // Set base value for numbering (0 for C-style arrays) - const SCOTCH_Num baseval = 0; - - // vertlocnbr: Number of elements (vertices) that are local to this MPI rank - const SCOTCH_Num vertlocnbr = static_cast(mesh.num_elems); - - // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) - const SCOTCH_Num vertlocmax = vertlocnbr; - - // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- - // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins - std::vector vertloctab(vertlocnbr + 1); - - // edgeloctab: flat array of neighbor global IDs for all local elements, built in order - std::vector edgeloctab; - edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance - - // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) - // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. 
- std::map elem_gid_to_offset; - size_t current_offset = 0; - for (size_t k = 0; k < num_elements_on_rank; k++) { - elem_gid_to_offset[elements_on_rank[k]] = current_offset; - current_offset += num_elems_in_elem_per_rank[k]; - } - - // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- - SCOTCH_Num offset = 0; // running count of edges encountered - - for (size_t lid = 0; lid < mesh.num_elems; ++lid) { - - // Record current edge offset for vertex lid in vertloctab - vertloctab[lid] = offset; - - // Obtain this local element's global ID (from mapping) - int elem_gid = mesh.local_to_global_elem_mapping.host(lid); - - // Find offset in the flattened neighbor array for this element's neighbor list - size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; - - // For this element, find the count of its neighbors - // This requires finding its index in the elements_on_rank array - size_t idx = 0; - for (size_t k = 0; k < num_elements_on_rank; k++) { - if (elements_on_rank[k] == elem_gid) { - idx = k; - break; - } - } - size_t num_nbrs = num_elems_in_elem_per_rank[idx]; - - // Append each neighbor (by its GLOBAL elem GID) to edgeloctab - for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! 
- edgeloctab.push_back(static_cast(neighbor_gid)); - ++offset; // Increment running edge count - } - } - - // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure - vertloctab[vertlocnbr] = offset; - - // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally - // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) - const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) - const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size matches number of endpoints - - // Optionally print graph structure for debugging/validation - if (print_info) { - std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr - << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; - std::cout << "vertloctab (CSR row offsets): "; - for (size_t i = 0; i <= vertlocnbr; i++) { - std::cout << vertloctab[i] << " "; - } - std::cout << std::endl; - std::cout << "edgeloctab (first 20 neighbor GIDs): "; - for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { - std::cout << edgeloctab[i] << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - - /************************************************************************** - * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild - * - * - PT-Scotch will use our CSR arrays. Since we use compact representation, - * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") - * can be passed as nullptr. - * - edgeloctab contains *GLOBAL element GIDs* of neighbors. PT-Scotch uses this - * to discover connections across processor boundaries, so you do not have to - * encode ownership or partition information yourself. 
- **************************************************************************/ - int rc = SCOTCH_dgraphBuild( - &dgraph, - baseval, // start index (0) - vertlocnbr, // local vertex count (local elements) - vertlocmax, // local vertex max (no holes) - vertloctab.data(), // row offsets in edgeloctab - /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) - /*veloloctab*/ nullptr, // vertex weights, not used - /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) - edgelocnbr, // local edge endpoints count - edgelocsiz, // size of edge array - edgeloctab.data(), // global neighbor IDs for each local node - /*edgegsttab*/ nullptr, // ghost edge array, not used - /*edloloctab*/ nullptr // edge weights, not used - ); - if (rc != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; - SCOTCH_dgraphFree(&dgraph); - MPI_Abort(MPI_COMM_WORLD, rc); - } - - // Optionally, print rank summary after graph build for further validation - if (print_info) { - SCOTCH_Num vertlocnbr_out; - SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); - std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished building the distributed graph using PT-Scotch"<(world_size)); - - - - - // ===================== PT-Scotch Strategy Selection and Documentation ====================== - // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. - // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. - // - // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): - // - // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. - // Useful for quick, generic partitions where quality is not critical. 
- // - // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). - // For large runs or test runs where speed is more important than minimizing edgecut. - // - // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). - // Slower than the default. Use when high-quality partitioning is desired. - // - // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. - // Use if load balance is more critical than cut size. - // - // Additional Options: - // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). - // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). - // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. - // - // Example usage: - // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); - // ^ quality-focused, nparts=number of parts/ranks - // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); - // ^ speed-focused, allow 5% imbalance - // - // Reference: - // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf - // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. - // - // --------------- Set up the desired partitioning strategy here: --------------- - SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings - SCOTCH_stratInit(&stratdat); - - // Select partitioning strategy for this run: - // Use SCOTCH_STRATQUALITY for best cut quality. - // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. 
- // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) - SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.01); - - // partloctab: output array mapping each local element (vertex) to a *target partition number* - // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. - std::vector partloctab(vertlocnbr); - rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); - if (rc != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; - SCOTCH_stratExit(&stratdat); - SCOTCH_archExit(&archdat); - SCOTCH_dgraphFree(&dgraph); - MPI_Abort(MPI_COMM_WORLD, rc); - } - - // Clean up PT-Scotch strategy and architecture objects - SCOTCH_stratExit(&stratdat); - SCOTCH_archExit(&archdat); - - // Free the graph now that we have the partition assignments - SCOTCH_dgraphFree(&dgraph); - - /*************************************************************************** - * Step 7 (Optional): Print out the partitioning assignment per element - * - Each local element's local index lid and global ID (gid) are listed with the - * part to which PT-Scotch has assigned them. 
- ***************************************************************************/ - print_info = false; - for(int rank_id = 0; rank_id < world_size; rank_id++) { - if(rank_id == rank && print_info) { - for (size_t lid = 0; lid < mesh.num_elems; ++lid) { - size_t gid = mesh.local_to_global_elem_mapping.host(lid); - std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid - << " -> part=" << partloctab[lid] << "\n"; - } - MPI_Barrier(MPI_COMM_WORLD); - } - MPI_Barrier(MPI_COMM_WORLD); - } - print_info = false; - - - -// ****************************************************************************************** -// Build the final mesh from the repartition -// ****************************************************************************************** - - - - MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) std::cout << "\n=== Starting Mesh Redistribution Phase ===\n"; - MPI_Barrier(MPI_COMM_WORLD); - - // -------------- Phase 1: Determine elements to send to each rank -------------- - std::vector> elems_to_send(world_size); - for (int lid = 0; lid < mesh.num_elems; ++lid) { - int dest = static_cast(partloctab[lid]); - int elem_gid = static_cast(mesh.local_to_global_elem_mapping.host(lid)); - elems_to_send[dest].push_back(elem_gid); - } - - // -------------- Phase 2: Exchange element GIDs -------------- - std::vector sendcounts(world_size), recvcounts(world_size); - for (int r = 0; r < world_size; ++r) - sendcounts[r] = static_cast(elems_to_send[r].size()); - - MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - - // Compute displacements - std::vector sdispls(world_size), rdispls(world_size); - int send_total = 0, recv_total = 0; - for (int r = 0; r < world_size; ++r) { - sdispls[r] = send_total; - rdispls[r] = recv_total; - send_total += sendcounts[r]; - recv_total += recvcounts[r]; - } - - - // Flatten send buffer - std::vector sendbuf; - sendbuf.reserve(send_total); - for (int r 
= 0; r < world_size; ++r) - sendbuf.insert(sendbuf.end(), elems_to_send[r].begin(), elems_to_send[r].end()); - - // Receive new local element GIDs - std::vector recvbuf(recv_total); - MPI_Alltoallv(sendbuf.data(), sendcounts.data(), sdispls.data(), MPI_INT, - recvbuf.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element GIDs"< new_elem_gids = recvbuf; - int num_new_elems = static_cast(new_elem_gids.size()); - - - if (print_info) { - std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; - } - - // -------------- Phase 3: Send element–node connectivity -------------- - int nodes_per_elem = mesh.num_nodes_in_elem; - - // Flatten element-node connectivity by global node IDs - std::vector conn_sendbuf; - for (int r = 0; r < world_size; ++r) { - for (int gid : elems_to_send[r]) { - // find local element lid from gid - int lid = -1; - for (int i = 0; i < mesh.num_elems; ++i) - if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } - - for (int j = 0; j < nodes_per_elem; j++) { - int node_lid = mesh.nodes_in_elem.host(lid, j); - int node_gid = mesh.local_to_global_node_mapping.host(node_lid); - conn_sendbuf.push_back(node_gid); - } - } - } - - // element-node connectivity counts (ints per dest rank) - std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); - for (int r = 0; r < world_size; ++r) - conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; - - MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< conn_sdispls(world_size), conn_rdispls(world_size); - int conn_send_total = 0, conn_recv_total = 0; - for (int r = 0; r < world_size; ++r) { - conn_sdispls[r] = conn_send_total; - conn_rdispls[r] = conn_recv_total; - conn_send_total += 
conn_sendcounts[r]; - conn_recv_total += conn_recvcounts[r]; - } - - std::vector conn_recvbuf(conn_recv_total); - MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, - conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); - std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); - int num_new_nodes = static_cast(new_node_gids.size()); - - // Build map gid→lid - std::unordered_map node_gid_to_lid; - for (int i = 0; i < num_new_nodes; ++i) - node_gid_to_lid[new_node_gids[i]] = i; - - if (print_info) - std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; - - - // -------------- Phase 5: Request node coordinates -------------- - std::vector node_coords_sendbuf; - for (int r = 0; r < world_size; ++r) { - for (int gid : elems_to_send[r]) { - int lid = -1; - for (int i = 0; i < mesh.num_elems; ++i) - if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } - - for (int j = 0; j < nodes_per_elem; j++) { - int node_lid = mesh.nodes_in_elem.host(lid, j); - int node_gid = mesh.local_to_global_node_mapping.host(node_lid); - - node_coords_sendbuf.push_back(node.coords.host(node_lid, 0)); - node_coords_sendbuf.push_back(node.coords.host(node_lid, 1)); - node_coords_sendbuf.push_back(node.coords.host(node_lid, 2)); - } - } - } - - // Each node is 3 doubles; same sendcounts scaling applies - std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); - for (int r = 0; r < world_size; ++r) - coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; - - MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging node coordinates counts"< 
coord_sdispls(world_size), coord_rdispls(world_size); - int coord_send_total = 0, coord_recv_total = 0; - for (int r = 0; r < world_size; ++r) { - coord_sdispls[r] = coord_send_total; - coord_rdispls[r] = coord_recv_total; - coord_send_total += coord_sendcounts[r]; - coord_recv_total += coord_recvcounts[r]; - } - - std::vector coord_recvbuf(coord_recv_total); - MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, - coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging node coordinates"<(num_new_nodes); - intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); - - // Fill global mappings - for (int i = 0; i < num_new_nodes; ++i) - intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; - for (int i = 0; i < num_new_elems; ++i) - intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; - - intermediate_mesh.local_to_global_node_mapping.update_device(); - intermediate_mesh.local_to_global_elem_mapping.update_device(); - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"<> node_gid_to_coords; - int coord_idx = 0; - for (int e = 0; e < num_new_elems; ++e) { - for (int j = 0; j < nodes_per_elem; j++) { - int node_gid = conn_recvbuf[e * nodes_per_elem + j]; - if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { - node_gid_to_coords[node_gid] = { - coord_recvbuf[coord_idx*3 + 0], - coord_recvbuf[coord_idx*3 + 1], - coord_recvbuf[coord_idx*3 + 2] - }; - } - coord_idx++; - } - } - - // Now fill coordinates in node order - intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); - for (int i = 0; i < num_new_nodes; ++i) { - int node_gid = new_node_gids[i]; - auto it = node_gid_to_coords.find(node_gid); 
- if (it != node_gid_to_coords.end()) { - intermediate_node.coords.host(i, 0) = it->second[0]; - intermediate_node.coords.host(i, 1) = it->second[1]; - intermediate_node.coords.host(i, 2) = it->second[2]; - } - } - intermediate_node.coords.update_device(); - - // Connectivity rebuild - intermediate_mesh.build_connectivity(); - MPI_Barrier(MPI_COMM_WORLD); - - - -// ****************************************************************************************** -// Build the ghost elements -// ****************************************************************************************** - - double t_ghost_start = MPI_Wtime(); - - // First, gather the number of elements each rank owns - std::vector elem_counts(world_size); - - // int MPI_Allgather( - // const void* sendbuf, // Data to send from this process - // int sendcount, // Number of elements to send - // MPI_Datatype sendtype, // Type of send data - // void* recvbuf, // Buffer to receive all data - // int recvcount, // Number of elements to receive from each process - // MPI_Datatype recvtype, // Type of receive data - // MPI_Comm comm // Communicator - // ); - MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - // Compute displacements - std::vector elem_displs(world_size); - int total_elems = 0; - for (int r = 0; r < world_size; ++r) { - elem_displs[r] = total_elems; - total_elems += elem_counts[r]; - } - - // Gather all element GIDs from all ranks - std::vector all_elem_gids(total_elems); - - // int MPI_Allgatherv( - // const void* sendbuf, // Data to send from this process - // int sendcount, // Number of elements THIS process sends - // MPI_Datatype sendtype, // Type of send data - // void* recvbuf, // Buffer to receive all data - // const int* recvcounts, // Array: number of elements from each process - // const int* displs, // Array: displacement for each process's data - // MPI_Datatype recvtype, // Type of receive data - // 
MPI_Comm comm // Communicator - // ); - MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, - all_elem_gids.data(), elem_counts.data(), elem_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - // Build a map: element GID -> owning rank - std::map elem_gid_to_rank; - for (int r = 0; r < world_size; ++r) { - for (int i = 0; i < elem_counts[r]; ++i) { - size_t gid = all_elem_gids[elem_displs[r] + i]; - elem_gid_to_rank[gid] = r; - } - } - - // Strategy: Find ghost elements by checking neighbors of our boundary elements. - // A boundary element is one that has a neighbor owned by another rank. - // However, since build_connectivity() only includes locally-owned elements, - // we need to use a different approach: find elements on other ranks that share - // nodes with our locally-owned elements. - - // First, collect all nodes that belong to our locally-owned elements - std::set local_elem_nodes; - for (int lid = 0; lid < num_new_elems; ++lid) { - for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - local_elem_nodes.insert(node_gid); - } - } - - // Now collect element-to-node connectivity to send to all ranks - // Format: for each element, list its node GIDs (each entry is a pair: elem_gid, node_gid) - std::vector elem_node_conn; - int local_conn_size = 0; - - for (int lid = 0; lid < num_new_elems; ++lid) { - size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); - for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - elem_node_conn.push_back(elem_gid); - elem_node_conn.push_back(node_gid); - } - local_conn_size += nodes_per_elem * 2; // Each pair is 2 
size_ts - } - - // Exchange element-node connectivity with all ranks using Allgather - // First, gather the sizes from each rank - std::vector conn_sizes(world_size); - MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - // Compute displacements - std::vector conn_displs(world_size); - int total_conn = 0; - for (int r = 0; r < world_size; ++r) { - conn_displs[r] = total_conn; - total_conn += conn_sizes[r]; - } - - // Gather all element-node pairs from all ranks - std::vector all_conn(total_conn); - MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, - all_conn.data(), conn_sizes.data(), conn_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - // create a set for local_elem_gids - std::set local_elem_gids; - for (int i = 0; i < num_new_elems; ++i) { - local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); - } - - // Build a map: node GID -> set of element GIDs that contain it (from other ranks) - std::map> node_to_ext_elem; - for (int r = 0; r < world_size; ++r) { - if (r == rank) continue; // Skip our own data - // Process pairs from rank r: conn_sizes[r] is in units of size_ts, so num_pairs = conn_sizes[r] / 2 - int num_pairs = conn_sizes[r] / 2; - for (int i = 0; i < num_pairs; ++i) { - // Each pair is 2 size_ts, starting at conn_displs[r] - int offset = conn_displs[r] + i * 2; - size_t elem_gid = all_conn[offset]; - size_t node_gid = all_conn[offset + 1]; - - // If this node is in one of our elements, then the element is a potential ghost - if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { - // Check if this element is not owned by us - if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { - node_to_ext_elem[node_gid].insert(elem_gid); - } - } - } - } - - // Collect all unique ghost element GIDs - std::set ghost_elem_gids; - for (const auto& pair : node_to_ext_elem) { - for 
(size_t elem_gid : pair.second) { - ghost_elem_gids.insert(elem_gid); - } - } - - // Additional check: elements that are neighbors of our locally-owned elements - // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) - - for (int lid = 0; lid < num_new_elems; ++lid) { - size_t num_neighbors = intermediate_mesh.num_elems_in_elem(lid); - - for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { - size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); - - if (neighbor_lid < static_cast(num_new_elems)) { - size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); - - // Check if neighbor is owned by this rank - auto it = elem_gid_to_rank.find(neighbor_gid); - if (it != elem_gid_to_rank.end() && it->second != rank) { - // Neighbor is owned by another rank - it's a ghost for us - ghost_elem_gids.insert(neighbor_gid); - } - } - } - } - - // Count unique ghost elements - intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); - - MPI_Barrier(MPI_COMM_WORLD); - double t_ghost_end = MPI_Wtime(); - - if (rank == 0) { - std::cout << " Finished calculating ghost elements" << std::endl; - std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." 
<< std::endl; - } - - // Print ghost element info if requested - print_info = false; - for(int i = 0; i < world_size; i++) { - MPI_Barrier(MPI_COMM_WORLD); - if(rank == i && print_info) { - std::cout << "[rank " << rank << "] owns " << num_new_elems - << " elements and has " << intermediate_mesh.num_ghost_elems << " ghost elements" << std::endl; - std::cout << "[rank " << rank << "] owned element global IDs: "; - for (int j = 0; j < intermediate_mesh.num_elems; j++) { - std::cout << intermediate_mesh.local_to_global_elem_mapping(j) << " "; - } - - // Print global IDs of ghost elements - std::cout << std::endl << "[rank " << rank << "] ghost element global IDs: "; - for (const auto& gid : ghost_elem_gids) { - std::cout << gid << " "; - } - std::cout << std::endl; - } - - MPI_Barrier(MPI_COMM_WORLD); - } - - - - // Build the connectivity that includes ghost elements - // Create an extended mesh with owned elements first, then ghost elements appended - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; - - // Step 1: Extract ghost element-node connectivity from all_conn - // Build a map: ghost_elem_gid -> vector of node_gids (ordered as in all_conn) - std::map> ghost_elem_to_nodes; - for (const size_t& ghost_gid : ghost_elem_gids) { - ghost_elem_to_nodes[ghost_gid].reserve(nodes_per_elem); - } - - // Extract nodes for each ghost element from all_conn - // The all_conn array has pairs (elem_gid, node_gid) for each rank's elements - for (int r = 0; r < world_size; ++r) { - if (r == rank) continue; // Skip our own data (we already have owned element connectivity) - int num_pairs = conn_sizes[r] / 2; - - // Process pairs in order - each element's nodes are contiguous - for (int i = 0; i < num_pairs; ++i) { - int offset = conn_displs[r] + i * 2; - size_t elem_gid = all_conn[offset]; - size_t node_gid = all_conn[offset + 1]; - - // If this is one of our ghost elements, record its node (in order) - 
auto it = ghost_elem_to_nodes.find(elem_gid); - if (it != ghost_elem_to_nodes.end()) { - it->second.push_back(node_gid); - } - } - } - - // Verify each ghost element has the correct number of nodes - for (auto& pair : ghost_elem_to_nodes) { - if (pair.second.size() != static_cast(nodes_per_elem)) { - std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first - << " has " << pair.second.size() << " nodes, expected " << nodes_per_elem << std::endl; - } - } - - // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) - // Start with owned nodes - std::map node_gid_to_extended_lid; - int extended_node_lid = 0; - - // Add all owned nodes - for (int i = 0; i < intermediate_mesh.num_nodes; ++i) { - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(i); - node_gid_to_extended_lid[node_gid] = extended_node_lid++; - } - - // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) - std::set ghost_only_nodes; - for (const auto& pair : ghost_elem_to_nodes) { - for (size_t node_gid : pair.second) { - // Check if we already have this node - if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { - ghost_only_nodes.insert(node_gid); - } - } - } - - // Assign extended local IDs to ghost-only nodes - for (size_t node_gid : ghost_only_nodes) { - node_gid_to_extended_lid[node_gid] = extended_node_lid++; - } - - int total_extended_nodes = extended_node_lid; - - // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) - // Build request list: for each ghost node, find an owning rank via any ghost element that contains it - std::map> rank_to_ghost_node_requests; - for (size_t node_gid : ghost_only_nodes) { - // Find which rank owns an element containing this node - // Look through ghost elements - for (const auto& pair : ghost_elem_to_nodes) { - size_t ghost_elem_gid = pair.first; - const std::vector& nodes = pair.second; - bool found = false; - 
for (size_t ngid : nodes) { - if (ngid == node_gid) { - found = true; - break; - } - } - if (found) { - auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); - if (owner_it != elem_gid_to_rank.end()) { - rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); - break; - } - } - } - } - - // Step 4: Build extended element list and node connectivity - // Owned elements: 0 to num_new_elems-1 (already have these) - // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 - - // Create extended element-node connectivity array - int total_extended_elems = intermediate_mesh.num_elems + intermediate_mesh.num_ghost_elems; - std::vector> extended_nodes_in_elem(total_extended_elems); - - // Copy owned element connectivity (convert to extended node LIDs) - for (int lid = 0; lid < intermediate_mesh.num_elems; ++lid) { - extended_nodes_in_elem[lid].reserve(nodes_per_elem); - for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - int ext_lid = node_gid_to_extended_lid[node_gid]; - extended_nodes_in_elem[lid].push_back(ext_lid); - } - } - - // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) - int ghost_elem_ext_lid = intermediate_mesh.num_elems; - std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); - std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); - - for (size_t ghost_gid : ghost_elem_gids_ordered) { - auto it = ghost_elem_to_nodes.find(ghost_gid); - if (it == ghost_elem_to_nodes.end()) continue; - - extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); - for (size_t node_gid : it->second) { - int ext_lid = node_gid_to_extended_lid[node_gid]; - extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); - } - ghost_elem_ext_lid++; - } - - MPI_Barrier(MPI_COMM_WORLD); - // Sequential rank-wise printing of extended 
mesh structure info - for (int r = 0; r < world_size; ++r) { - if (rank == r) { - std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; - std::cout << "[rank " << rank << "] - Owned elements: " << intermediate_mesh.num_elems << std::endl; - std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; - std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; - std::cout << "[rank " << rank << "] - Owned nodes: " << intermediate_mesh.num_nodes << std::endl; - std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; - std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; - std::cout << std::flush; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements - // Each element's nodes are stored using extended local node IDs (0-based, contiguous) - - // Build reverse maps: extended_lid -> gid for nodes and elements - std::vector extended_lid_to_node_gid(total_extended_nodes); - for (const auto& pair : node_gid_to_extended_lid) { - extended_lid_to_node_gid[pair.second] = pair.first; - } - - // Build extended element GID list: owned first, then ghost - std::vector extended_lid_to_elem_gid(total_extended_elems); - // Owned elements - for (int i = 0; i < intermediate_mesh.num_elems; ++i) { - extended_lid_to_elem_gid[i] = intermediate_mesh.local_to_global_elem_mapping.host(i); - } - // Ghost elements (in sorted order) - for (size_t idx = 0; idx < ghost_elem_gids_ordered.size(); ++idx) { - extended_lid_to_elem_gid[intermediate_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; - } - - final_mesh.initialize_nodes(total_extended_nodes); - final_mesh.initialize_elems(total_extended_elems, 3); - final_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); - 
final_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); - for (int i = 0; i < total_extended_nodes; i++) { - final_mesh.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; - } - for (int i = 0; i < total_extended_elems; i++) { - final_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; - } - final_mesh.local_to_global_node_mapping.update_device(); - final_mesh.local_to_global_elem_mapping.update_device(); - - final_mesh.num_ghost_elems = ghost_elem_gids.size(); - final_mesh.num_ghost_nodes = ghost_only_nodes.size(); - - // Set owned counts for write_vtk (excludes ghost elements/nodes) - final_mesh.num_owned_elems = intermediate_mesh.num_elems; - final_mesh.num_owned_nodes = intermediate_mesh.num_nodes; - - - // Print num ghost elements and nodes on each rank sequentially - for (int r = 0; r < world_size; ++r) { - if (rank == r) { - std::cout << "*******[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; - std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_ghost_nodes << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< coordinate. - // 3. Use this map to fill final_node.coords. - - // 1. Build list of all global node IDs needed on this rank (owned + ghosts) - std::vector all_needed_node_gids(total_extended_nodes); - for (int i = 0; i < total_extended_nodes; ++i) { - all_needed_node_gids[i] = final_mesh.local_to_global_node_mapping.host(i); - } - - // 2. Build owned node GIDs and their coordinates - std::vector owned_gids(intermediate_mesh.num_nodes); - for (int i = 0; i < owned_gids.size(); ++i) - owned_gids[i] = intermediate_mesh.local_to_global_node_mapping.host(i); - - // 3. 
Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) - // so we can distribute the needed coordinate data. - // The easiest is to Allgather everyone's "owned_gids" and coords - - int local_owned_count = static_cast(owned_gids.size()); - std::vector owned_counts(world_size, 0); - if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 - - // a) Gather counts - owned_counts.resize(world_size, 0); - MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - // b) Displacements and total - std::vector owned_displs(world_size,0); - int total_owned = 0; - for (int r=0; r all_owned_gids(total_owned); - MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, - all_owned_gids.data(), owned_counts.data(), owned_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - - // d) Global coords (size: total_owned x 3) - std::vector owned_coords_send(3*local_owned_count, 0.0); - for (int i=0; i all_owned_coords(3 * total_owned, 0.0); - - // Create coordinate-specific counts and displacements (in units of doubles, not nodes) - std::vector coord_counts(world_size); - std::vector coord_displs(world_size); - for (int r=0; r coord[3] - std::unordered_map> gid_to_coord; - for (int i=0; i xyz = { - all_owned_coords[3*i+0], - all_owned_coords[3*i+1], - all_owned_coords[3*i+2] - }; - gid_to_coord[all_owned_gids[i]] = xyz; - } - - // 4. Finally, fill final_node.coords with correct coordinates. 
- for (int i = 0; i < total_extended_nodes; ++i) { - size_t gid = final_mesh.local_to_global_node_mapping.host(i); - auto it = gid_to_coord.find(gid); - if (it != gid_to_coord.end()) { - final_node.coords.host(i,0) = it->second[0]; - final_node.coords.host(i,1) = it->second[1]; - final_node.coords.host(i,2) = it->second[2]; - } else { - // Could happen if there's a bug: fill with zeros for safety - final_node.coords.host(i,0) = 0.0; - final_node.coords.host(i,1) = 0.0; - final_node.coords.host(i,2) = 0.0; - } - } - final_node.coords.update_device(); - - - - - // -------------------------------------------------------------------------------------- - // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. - // Steps: - // 1) Each rank contributes its ghost element GIDs. - // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. - // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. - // -------------------------------------------------------------------------------------- - std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); - - // Prepare local ghost list as vector - std::vector ghost_gids_vec; - ghost_gids_vec.reserve(ghost_elem_gids.size()); - for (const auto &g : ghost_elem_gids) ghost_gids_vec.push_back(g); - - // Exchange counts - std::vector ghost_counts(world_size, 0); - int local_ghost_count = static_cast(ghost_gids_vec.size()); - MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - // Displacements and recv buffer - std::vector ghost_displs(world_size, 0); - int total_ghosts = 0; - for (int r = 0; r < world_size; ++r) { - ghost_displs[r] = total_ghosts; - total_ghosts += ghost_counts[r]; - } - std::vector all_ghost_gids(total_ghosts); - - // Gather ghost gids - MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, - all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), - 
MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished gathering ghost element GIDs" << std::endl; - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to build the reverse map for communication" << std::endl; - // Build map gid -> ranks that ghost it - std::unordered_map> gid_to_ghosting_ranks; - gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); - for (int r = 0; r < world_size; ++r) { - int cnt = ghost_counts[r]; - int off = ghost_displs[r]; - for (int i = 0; i < cnt; ++i) { - size_t g = all_ghost_gids[off + i]; - gid_to_ghosting_ranks[g].push_back(r); - } - } - - // For each local element, list destinations: ranks that ghost our gid - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { - size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); - auto it = gid_to_ghosting_ranks.find(local_elem_gid); - if (it == gid_to_ghosting_ranks.end()) continue; - const std::vector &dest_ranks = it->second; - for (int rr : dest_ranks) { - if (rr == rank) continue; - boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); - } - } - - std::cout.flush(); - MPI_Barrier(MPI_COMM_WORLD); - // Optional: print a compact summary of reverse map for verification (limited output) - for(int i = 0; i < world_size; i++) { - if (rank == i && print_info) { - std::cout << std::endl; - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { - - size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); - if (boundary_elem_targets[elem_lid].empty()) - { - std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost elements" << std::endl; - } - else - { - std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: "; - int shown = 0; - for (const auto &pr : 
boundary_elem_targets[elem_lid]) { - if (shown >= 12) { std::cout << " ..."; break; } - std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; - shown++; - } - std::cout << std::endl; - } - } - std::cout.flush(); - } - MPI_Barrier(MPI_COMM_WORLD); - } - - print_info = false; - - - MPI_Barrier(MPI_COMM_WORLD); - - - - - - - for(int i = 0; i < world_size; i++) { - if(rank == i && print_info) { - print_rank_mesh_info(intermediate_mesh, i); - } - MPI_Barrier(MPI_COMM_WORLD); - } - MPI_Barrier(MPI_COMM_WORLD); - + partition_mesh(initial_mesh, final_mesh, initial_node, final_node, world_size, rank); // write_vtk(intermediate_mesh, intermediate_node, rank); + MPI_Barrier(MPI_COMM_WORLD); write_vtu(final_mesh, final_node, rank, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); // Stop timer and get execution time @@ -2023,7 +78,5 @@ int main(int argc, char** argv) { MATAR_FINALIZE(); MPI_Finalize(); - - return 0; } \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 7e6f6c83..10d8838f 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -116,7 +116,6 @@ inline int PointIndexFromIJK(int i, int j, int k, const int* order) ///////////////////////////////////////////////////////////////////////////// void build_3d_box( Mesh_t& mesh, - GaussPoint_t& GaussPoints, node_t& node, double origin[3], double length[3], @@ -252,26 +251,7 @@ void build_3d_box( // ---- Update host data ---- - // material point values - // State.MaterialPoints.den.update_host(); - // State.MaterialPoints.pres.update_host(); - // State.MaterialPoints.stress.update_host(); - // State.MaterialPoints.sspd.update_host(); - // State.MaterialPoints.sie.update_host(); - // State.MaterialPoints.mass.update_host(); - // State.MaterialPoints.conductivity.update_host(); - // State.MaterialPoints.temp_grad.update_host(); - // State.MaterialPoints.eroded.update_host(); - - - // gauss point values - // 
State.GaussPoints.vol.update_host(); - - // nodal values node.coords.update_host(); - // State.node.vel.update_host(); - // State.node.mass.update_host(); - // State.node.temp.update_host(); Kokkos::fence(); From 3ac20b2e6a509a9c4455f628ba13cf76951762c3 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 3 Nov 2025 14:42:23 -0600 Subject: [PATCH 15/52] ENH: Developing communication plan, WIP --- examples/mesh_decomp/communication_plan.h | 572 ++++++++++++++++++++++ examples/mesh_decomp/decomp_utils.h | 266 ++++++++-- examples/mesh_decomp/mesh_decomp.cpp | 3 +- examples/mesh_decomp/mesh_io.h | 5 +- 4 files changed, 797 insertions(+), 49 deletions(-) create mode 100644 examples/mesh_decomp/communication_plan.h diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h new file mode 100644 index 00000000..83d2cb46 --- /dev/null +++ b/examples/mesh_decomp/communication_plan.h @@ -0,0 +1,572 @@ +/** + * @struct CommunicationPlan + * @brief Manages efficient MPI communication for ghost element and node data exchange + * + * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. + * Designed to be embedded in distributed data structures for automatic ghost synchronization. 
+ * + * Usage pattern in distributed structures: + * node.velocity.comm() -> automatically syncs ghost nodes + * elem.density.comm() -> automatically syncs ghost elements + * + * Memory layout philosophy: + * - Only std::vector (int, size_t, double) + * - CSR-style indexing for variable-length per-rank data + * - No std::map, std::set, std::pair, or nested containers + * - Pre-allocated MPI buffers to avoid repeated allocations + * - Separate element and node communication plans + */ + struct CommunicationPlan { + + // ======================================================================== + // CORE DATA STRUCTURES - FLAT ARRAYS ONLY + // ======================================================================== + + + // --- Ghost Send Plan: Owned elements/nodes -> destination ranks --- (Works for both elements and nodes) + int num_send_ranks; // Number of destination ranks + std::vector send_rank_ids; // [size: num_send_ranks] Destination rank IDs + std::vector send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids + std::vector send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send + std::vector send_ghost_gids; // [size: total_send_ghosts] Global IDs (for debug/validation) + + // --- Ghost Receive Plan: Ghost elements/nodes <- source ranks --- (Works for both elements and nodes) + int num_recv_ranks; // Number of source ranks + std::vector recv_rank_ids; // [size: num_recv_ranks] Source rank IDs + std::vector recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids + std::vector recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) + std::vector recv_ghost_gids; // [size: total_recv_ghosts] Global IDs + + + // --- MPI Communication Buffers (pre-allocated, reusable) --- + std::vector ghost_send_buffer; // Flat buffer for ghost data + std::vector ghost_recv_buffer; // Flat buffer for ghost data + + std::vector send_requests; // Request handles 
for sends + std::vector recv_requests; // Request handles for receives + std::vector mpi_statuses; // Status array for MPI_Waitall + + // --- Persistent communication (optional optimization) --- + std::vector persistent_send_requests; + std::vector persistent_recv_requests; + bool has_persistent_comm; + + + // --- Distributed Graph Topology for Neighborhood Collectives --- + MPI_Comm graph_comm; // Graph communicator encoding sparse communication pattern + bool has_graph_comm; // Whether graph communicator is initialized + + // Counts and displacements for MPI_Neighbor_alltoallv + std::vector send_counts; // [num_send_ranks] Number of items to send per neighbor + std::vector send_displs; // [num_send_ranks] Displacements in send buffer + std::vector recv_counts; // [num_recv_ranks] Number of items to recv per neighbor + std::vector recv_displs; // [num_recv_ranks] Displacements in recv buffer + + // --- Persistent Neighborhood Collectives (MPI-4.0+) --- + MPI_Request persistent_neighbor_request; // Persistent request for neighborhood collective + bool has_persistent_neighbor; // Whether persistent neighborhood is initialized + int persistent_num_fields; // Fields per item for persistent request + + + // ======================================================================== + // CONSTRUCTOR / INITIALIZATION + // ======================================================================== + + CommunicationPlan() + : num_send_ranks(0), num_recv_ranks(0), + has_persistent_comm(false), + has_graph_comm(false), + has_persistent_neighbor(false), + graph_comm(MPI_COMM_NULL), + persistent_neighbor_request(MPI_REQUEST_NULL), + persistent_num_fields(0) {} + + + // Destructor to free MPI resources + ~CommunicationPlan() { + // Free persistent neighborhood collective + if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { + MPI_Request_free(&persistent_neighbor_request); + } + + // Free graph communicator + if (has_graph_comm && graph_comm != 
MPI_COMM_NULL) { + MPI_Comm_free(&graph_comm); + } + } + + + /** + * @brief Build communication plan from mesh with flat array inputs + * @param mesh Reference to partitioned mesh (with ghost elements/nodes) + * @param world_size Number of MPI ranks + * @param my_rank Current MPI rank ID + * @param boundary_ghost_dest_ranks Flat array of destination ranks for boundary elements [size: sum of neighbors] + * @param boundary_ghsot_dest_offsets CSR offsets: boundary_ghost_dest_offsets[elem_lid] = start index in boundary_ghost_dest_ranks + * @param boundary_ghost_dest_gids Flat array of global ghost IDs to send [size: sum of neighbors] + * @param all_ghost_gids All ghost global IDs across all ranks + * @param all_ghost_owner_ranks Owner rank for each ghost GID + * + * This build() function takes only flat arrays as input (no std::map, std::set, std::pair). + * The caller must pre-process the mesh data into flat CSR-style arrays. + * + * Implementation: + * 1. Group sends/receives by rank using flat arrays and CSR indexing + * 2. Pre-allocate all MPI buffers + * 3. 
Store everything in contiguous memory + */ + void build( + const Mesh_t& mesh, + int world_size, + int my_rank, + const int* boundary_ghost_dest_ranks, // Flat array of dest ranks + const int* boundary_ghost_dest_offsets, // CSR offsets [size: num_owned_ghosts+1] + const size_t* boundary_ghost_dest_gids, // Flat array of ghost GIDs + const size_t* all_ghost_gids, // All ghost GIDs + const int* all_ghost_owner_ranks, // Owner ranks indexed by GID + ); + + + // ======================================================================== + // COMMUNICATION INTERFACE - FOR DISTRIBUTED DATA STRUCTURES + // ======================================================================== + + /** + * @brief Pack and exchange data with automatic ghost synchronization + * @param data_ptr Pointer to data array [size: num_total_items * stride] + * @param num_fields Number of fields per item (stride) + * @param item_type 0=elements, 1=nodes + * @param comm MPI communicator + * @param blocking If true, waits for completion before returning + * + * This is the main interface for distributed structures like: + * node.velocity.comm() internally calls: + * comm_plan.communicate(node.velocity.data(), 3, 1, MPI_COMM_WORLD, true) + */ + void communicate(double* data_ptr, int num_fields, int item_type, + MPI_Comm comm = MPI_COMM_WORLD, bool blocking = true); + + + /** + * @brief Non-blocking version: initiate communication + * Returns immediately; user must call wait_communication() + */ + void communicate_begin(double* data_ptr, int num_fields, int item_type, + MPI_Comm comm = MPI_COMM_WORLD); + + + /** + * @brief Wait for non-blocking communication to complete + */ + void wait_communication(double* data_ptr, int num_fields, int item_type); + + + // ======================================================================== + // LOW-LEVEL PACK/UNPACK (for manual control) + // ======================================================================== + + /** + * @brief Pack element data from contiguous 
array into send buffer + * @param data_ptr Pointer to element data [size: num_total_elems * num_fields] + * @param num_fields Stride (fields per element) + * + * Packs data in layout: [elem0_field0, elem0_field1, ..., elem1_field0, ...] + */ + void pack_ghosts(const double* data_ptr, int num_fields, int field_dimension); + + + /** + * @brief Unpack received element data into ghost elements + */ + void unpack_ghosts(double* data_ptr, int num_fields, int field_dimension); + + + + // ======================================================================== + // MPI EXCHANGE PRIMITIVES + // ======================================================================== + + /** + * @brief Execute MPI_Isend/Irecv for elements + */ + void exchange_ghosts_begin(int num_fields, int field_dimension, MPI_Comm comm = MPI_COMM_WORLD); + + + /** + * @brief Wait for element exchange to complete + */ + void exchange_ghosts_wait(); + + + + // ======================================================================== + // PERSISTENT COMMUNICATION (OPTIMIZATION) + // ======================================================================== + + /** + * @brief Setup persistent MPI communication handles (one-time setup) + * Call once after build(), then use start_persistent/wait_persistent + */ + void init_persistent(int elem_fields, int node_fields, MPI_Comm comm = MPI_COMM_WORLD); + + + /** + * @brief Start persistent send/recv (must call pack_* first) + */ + void start_persistent(); + + + /** + * @brief Wait for persistent communication (then call unpack_*) + */ + void wait_persistent(); + + + /** + * @brief Free persistent communication handles + */ + void free_persistent(); + + + // ======================================================================== + // NEIGHBORHOOD COLLECTIVES (MPI-3.0+) + // ======================================================================== + + /** + * @brief Create distributed graph communicator from communication pattern + * + * Call this ONCE after populating 
send_rank_ids and recv_rank_ids. + * The graph communicator encodes the sparse communication topology and is + * reused for all subsequent neighborhood collective calls. + * + * @param base_comm Base communicator (usually MPI_COMM_WORLD) + * + * Example from your output: + * rank 0 sends to: {2, 3, 4, 10, 11} + * rank 0 receives from: {computed from ghost ownership} + * + * This creates a directed graph where edges represent communication channels. + * MPI can optimize routing and minimize network contention. + * + * Requirements: MPI-3.0+ (2012) + */ + void create_graph_communicator(MPI_Comm base_comm = MPI_COMM_WORLD); + + + /** + * @brief Exchange ghost data using MPI_Neighbor_alltoallv + * + * Uses the pre-created graph communicator for efficient sparse communication. + * This is cleaner than manual Isend/Irecv loops and allows MPI to optimize. + * + * @param data_ptr Pointer to data array [size: num_total_items * num_fields] + * @param num_fields Number of fields per item (e.g., 3 for velocity) + * + * Workflow: + * 1. Pack owned items into send buffer + * 2. Call MPI_Neighbor_alltoallv (blocking but fast with graph_comm) + * 3. Unpack ghost items from receive buffer + * + * The graph_comm is reused each call - only pack/unpack overhead per timestep. + * + * Requirements: Must call create_graph_communicator() once before using this. + */ + void exchange_ghosts_neighborhood(double* data_ptr, int num_fields); + + + /** + * @brief Initialize persistent neighborhood collective (MPI-4.0+) + * + * Creates a persistent MPI request that pre-allocates all internal buffers + * and communication paths. Provides maximum performance for repeated exchanges + * with the same num_fields. 
+ * + * @param num_fields Number of fields per item (must be same for all timesteps) + * + * Call once during setup: + * comm_plan.create_graph_communicator(MPI_COMM_WORLD); + * comm_plan.init_persistent_neighborhood(3); // For 3D velocity + * + * Then use exchange_ghosts_persistent() each timestep. + * + * Requirements: MPI-4.0+ (2021). Check with: mpirun --version + */ + void init_persistent_neighborhood(int num_fields); + + + /** + * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) + * + * Must call init_persistent_neighborhood() once before using this. + * This is the fastest ghost exchange method for fixed communication patterns. + * + * @param data_ptr Pointer to data array [size: num_total_items * num_fields] + * + * Workflow: + * 1. Pack data into same send buffer used during init + * 2. MPI_Start() - extremely fast, no setup overhead + * 3. MPI_Wait() - wait for completion + * 4. Unpack from receive buffer + * + * Typical speedup vs standard neighborhood: 1.2-1.5x + * + * Note: Falls back to exchange_ghosts_neighborhood() if MPI-4 unavailable. + */ + void exchange_ghosts_persistent(double* data_ptr); + + + /** + * @brief Free persistent neighborhood collective resources + * + * Call at end of simulation to release MPI resources. + * Automatically called by destructor if not explicitly freed. 
+ */ + void free_persistent_neighborhood(); + + + // ======================================================================== + // UTILITIES + // ======================================================================== + + void print_summary(int rank) const; + bool validate(MPI_Comm comm = MPI_COMM_WORLD) const; + size_t send_volume(int elem_fields, int node_fields) const; + size_t recv_volume(int elem_fields, int node_fields) const; + bool needs_communication() const; + int num_neighbor_ranks() const; + + + // ======================================================================== + // INLINE IMPLEMENTATIONS - NEIGHBORHOOD COLLECTIVES + // ======================================================================== + + /** + * @brief Create distributed graph communicator from communication pattern + */ + inline void create_graph_communicator(MPI_Comm base_comm) { + + if (has_graph_comm) { + std::cerr << "Warning: Graph communicator already created, skipping." << std::endl; + return; + } + + int indegree = num_recv_ranks; // Number of ranks we receive FROM + int outdegree = num_send_ranks; // Number of ranks we send TO + + // Create the distributed graph communicator + // MPI_Dist_graph_create_adjacent signature: + // (comm_old, indegree, sources[], sourceweights, outdegree, dests[], destweights, + // info, reorder, comm_dist_graph) + int reorder = 0; // Don't reorder ranks (keep same as base_comm) + + MPI_Dist_graph_create_adjacent( + base_comm, // Base communicator + indegree, // We receive from num_recv_ranks neighbors + recv_rank_ids.data(), // Source ranks (we receive from these) + MPI_UNWEIGHTED, // No edge weights for sources + outdegree, // We send to num_send_ranks neighbors + send_rank_ids.data(), // Destination ranks (we send to these) + MPI_UNWEIGHTED, // No edge weights for destinations + MPI_INFO_NULL, // No special hints + reorder, // Don't reorder ranks + &graph_comm // Output: new graph communicator + ); + + has_graph_comm = true; + + // Pre-allocate 
counts and displacements arrays + send_counts.resize(num_send_ranks); + send_displs.resize(num_send_ranks); + recv_counts.resize(num_recv_ranks); + recv_displs.resize(num_recv_ranks); + } + + + /** + * @brief Exchange ghost data using MPI_Neighbor_alltoallv + */ + inline void exchange_ghosts_neighborhood(double* data_ptr, int num_fields) { + + if (!has_graph_comm) { + std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; + return; + } + + // 1. Pack send buffer from owned items + int total_send = send_ghost_lids.size(); + ghost_send_buffer.resize(total_send * num_fields); + + for (size_t i = 0; i < send_ghost_lids.size(); i++) { + int local_id = send_ghost_lids[i]; + for (int f = 0; f < num_fields; f++) { + ghost_send_buffer[i * num_fields + f] = data_ptr[local_id * num_fields + f]; + } + } + + // 2. Update counts and displacements for this num_fields + for (int i = 0; i < num_send_ranks; i++) { + int start_idx = send_ghost_offsets[i]; + int end_idx = send_ghost_offsets[i + 1]; + send_counts[i] = (end_idx - start_idx) * num_fields; + send_displs[i] = start_idx * num_fields; + } + + int total_recv = recv_ghost_lids.size(); + ghost_recv_buffer.resize(total_recv * num_fields); + + for (int i = 0; i < num_recv_ranks; i++) { + int start_idx = recv_ghost_offsets[i]; + int end_idx = recv_ghost_offsets[i + 1]; + recv_counts[i] = (end_idx - start_idx) * num_fields; + recv_displs[i] = start_idx * num_fields; + } + + // 3. 
Execute neighborhood collective (BLOCKING but fast with graph_comm) + // MPI_Neighbor_alltoallv signature: + // (sendbuf, sendcounts[], sdispls[], sendtype, + // recvbuf, recvcounts[], rdispls[], recvtype, comm) + MPI_Neighbor_alltoallv( + ghost_send_buffer.data(), // Send buffer + send_counts.data(), // Send counts per neighbor + send_displs.data(), // Send displacements + MPI_DOUBLE, // Send type + ghost_recv_buffer.data(), // Receive buffer + recv_counts.data(), // Receive counts per neighbor + recv_displs.data(), // Receive displacements + MPI_DOUBLE, // Receive type + graph_comm // Graph communicator (NOT MPI_COMM_WORLD!) + ); + + // 4. Unpack receive buffer into ghost items + for (size_t i = 0; i < recv_ghost_lids.size(); i++) { + int ghost_local_id = recv_ghost_lids[i]; + for (int f = 0; f < num_fields; f++) { + data_ptr[ghost_local_id * num_fields + f] = ghost_recv_buffer[i * num_fields + f]; + } + } + } + + + /** + * @brief Initialize persistent neighborhood collective (MPI-4.0+) + */ + inline void init_persistent_neighborhood(int num_fields) { + + if (!has_graph_comm) { + std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; + return; + } + + if (has_persistent_neighbor) { + std::cerr << "Warning: Persistent neighborhood already initialized, freeing and re-creating." 
<< std::endl; + free_persistent_neighborhood(); + } + + persistent_num_fields = num_fields; + + // Allocate buffers + int total_send = send_ghost_lids.size(); + int total_recv = recv_ghost_lids.size(); + ghost_send_buffer.resize(total_send * num_fields); + ghost_recv_buffer.resize(total_recv * num_fields); + + // Setup counts and displacements for persistent request + for (int i = 0; i < num_send_ranks; i++) { + int start_idx = send_ghost_offsets[i]; + int end_idx = send_ghost_offsets[i + 1]; + send_counts[i] = (end_idx - start_idx) * num_fields; + send_displs[i] = start_idx * num_fields; + } + + for (int i = 0; i < num_recv_ranks; i++) { + int start_idx = recv_ghost_offsets[i]; + int end_idx = recv_ghost_offsets[i + 1]; + recv_counts[i] = (end_idx - start_idx) * num_fields; + recv_displs[i] = start_idx * num_fields; + } + +#if MPI_VERSION >= 4 + // MPI-4.0+ persistent neighborhood collective + // MPI_Neighbor_alltoallv_init signature (similar to MPI_Neighbor_alltoallv but creates request): + // (sendbuf, sendcounts[], sdispls[], sendtype, + // recvbuf, recvcounts[], rdispls[], recvtype, comm, info, request) + MPI_Neighbor_alltoallv_init( + ghost_send_buffer.data(), send_counts.data(), send_displs.data(), MPI_DOUBLE, + ghost_recv_buffer.data(), recv_counts.data(), recv_displs.data(), MPI_DOUBLE, + graph_comm, + MPI_INFO_NULL, + &persistent_neighbor_request + ); + has_persistent_neighbor = true; +#else + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank == 0) { + std::cerr << "Warning: MPI-4.0 required for persistent neighborhood collectives" << std::endl; + std::cerr << " Detected MPI version: " << MPI_VERSION << "." 
<< MPI_SUBVERSION << std::endl; + std::cerr << " Will fall back to standard neighborhood collective" << std::endl; + } + has_persistent_neighbor = false; +#endif + } + + + /** + * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) + */ + inline void exchange_ghosts_persistent(double* data_ptr) { + +#if MPI_VERSION >= 4 + if (!has_persistent_neighbor) { + std::cerr << "Error: Must call init_persistent_neighborhood() first!" << std::endl; + std::cerr << " Falling back to standard neighborhood collective..." << std::endl; + exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); + return; + } + + // 1. Pack send buffer (same memory location as during init) + for (size_t i = 0; i < send_ghost_lids.size(); i++) { + int local_id = send_ghost_lids[i]; + for (int f = 0; f < persistent_num_fields; f++) { + ghost_send_buffer[i * persistent_num_fields + f] = + data_ptr[local_id * persistent_num_fields + f]; + } + } + + // 2. Start persistent request (VERY fast - no setup overhead) + MPI_Start(&persistent_neighbor_request); + + // 3. Wait for completion + MPI_Wait(&persistent_neighbor_request, MPI_STATUS_IGNORE); + + // 4. 
Unpack receive buffer + for (size_t i = 0; i < recv_ghost_lids.size(); i++) { + int ghost_id = recv_ghost_lids[i]; + for (int f = 0; f < persistent_num_fields; f++) { + data_ptr[ghost_id * persistent_num_fields + f] = + ghost_recv_buffer[i * persistent_num_fields + f]; + } + } +#else + // Fallback to standard method if MPI-4 not available + exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); +#endif + } + + + /** + * @brief Free persistent neighborhood collective resources + */ + inline void free_persistent_neighborhood() { +#if MPI_VERSION >= 4 + if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { + MPI_Request_free(&persistent_neighbor_request); + persistent_neighbor_request = MPI_REQUEST_NULL; + has_persistent_neighbor = false; + } +#endif + } + +}; + + diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 0357b6a6..26dd83c6 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -19,6 +19,11 @@ #include "scotch.h" #include "ptscotch.h" + + + + + void partition_mesh( Mesh_t& initial_mesh, Mesh_t& final_mesh, @@ -44,7 +49,7 @@ void partition_mesh( int num_nodes_per_elem = 0; - std::vector elements_on_rank; + std::vector nodes_on_rank; @@ -64,14 +69,10 @@ void partition_mesh( std::vector> node_pos_on_rank(world_size); - - if (rank == 0) { num_nodes_per_elem = initial_mesh.num_nodes_in_elem; - // Compute elements to send to each rank; handle remainders for non-even distribution - // Compute elements to send to each rank; handle remainders for non-even distribution std::fill(elems_per_rank.begin(), elems_per_rank.end(), initial_mesh.num_elems / world_size); int remainder = initial_mesh.num_elems % world_size; @@ -80,6 +81,8 @@ void partition_mesh( } } + // Broadcasts the value of num_nodes_per_elem from the root rank (0) to all other ranks in MPI_COMM_WORLD. + // After this call, all ranks will have the same value for num_nodes_per_elem. 
MPI_Bcast(&num_nodes_per_elem, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); @@ -98,78 +101,74 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); - // Resize the elements_on_rank vector to hold the received data - elements_on_rank.resize(num_elements_on_rank); - + // Vector of element to send to each rank using a naive partitioning (0-m, m-n, n-o, etc.) + std::vector elements_on_rank(num_elements_on_rank); MPI_Barrier(MPI_COMM_WORLD); double t_scatter_end = MPI_Wtime(); - if(rank == 0) { - std::cout<<" Finished scattering the number of elements to each rank"< all_elements; - std::vector sendcounts(world_size); - std::vector displs(world_size); + std::vector all_elements; // array of all elements to be sent to each rank + std::vector sendcounts(world_size); // array of the number of elements to send to each rank + std::vector displs(world_size); // array of the displacement for each rank in the flattened array - int displacement = 0; + int displacement = 0; // displacement is the starting index of the elements for the current rank in the flattened array for (int i = 0; i < world_size; i++) { - sendcounts[i] = elems_per_rank[i]; - displs[i] = displacement; + sendcounts[i] = elems_per_rank[i]; // number of elements to send to each rank + displs[i] = displacement; // displacement for each rank in the flattened array // Copy elements for rank i to the flattened array for (int j = 0; j < elems_per_rank[i]; j++) { - all_elements.push_back(elements_to_send[i][j]); + all_elements.push_back(elements_to_send[i][j]); // add the elements to the flattened array } - displacement += elems_per_rank[i]; + displacement += elems_per_rank[i]; // increment the displacement by the number of elements to send to the next rank } // Send the elements to each rank + // all_elements.data(): Pointer to the flattened array of all elements to be sent to each rank + // sendcounts.data(): Array with the number of elements to send to each rank + // displs.data(): Array with the 
displacement for each rank in the flattened array + // MPI_INT: Data type of the elements (integer) + // elements_on_rank.data(): Pointer to the buffer where each rank will receive its elements + // num_elements_on_rank: Number of elements that the receiving rank expects to receive + // MPI_INT: Data type of the receive buffer (integer) + // 0: The root rank (rank 0) that is performing the scatter + // MPI_COMM_WORLD: The communicator MPI_Scatterv(all_elements.data(), sendcounts.data(), displs.data(), MPI_INT, elements_on_rank.data(), num_elements_on_rank, MPI_INT, 0, MPI_COMM_WORLD); } else { + // If the rank is not the root rank, it will receive nullptr for the sendbuf, sendcounts, and displs arrays MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, elements_on_rank.data(), num_elements_on_rank, MPI_INT, 0, MPI_COMM_WORLD); } + // Wait for all ranks to complete the scatter operation MPI_Barrier(MPI_COMM_WORLD); + + // Timer: End measuring time for scattering element global ids double t_scatter_gids_end = MPI_Wtime(); - if(rank == 0) { + if(rank == 0 && print_info) { std::cout<<" Finished scattering the actual element global ids to each rank"< ranks that ghost it std::unordered_map> gid_to_ghosting_ranks; gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); @@ -1906,6 +1905,10 @@ void partition_mesh( std::cout.flush(); MPI_Barrier(MPI_COMM_WORLD); + + + + // Optional: print a compact summary of reverse map for verification (limited output) for(int i = 0; i < world_size; i++) { if (rank == i && print_info) { @@ -1929,7 +1932,76 @@ void partition_mesh( std::cout << std::endl; } } - std::cout.flush(); + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + // Add a vector to store boundary element local_ids (those who have ghost destinations across ranks) + std::vector boundary_elem_local_ids; + std::vector> boundary_to_ghost_ranks; // ragged array dimensions (num_boundary_elems, num_ghost_ranks) + + std::set ghost_comm_ranks; // set of ranks that this rank communicates with 
+ + + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + + int local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + if (boundary_elem_targets[elem_lid].empty()) + { + continue; + } + else + { + // Fill in vector of boundary local_ids + boundary_elem_local_ids.push_back(elem_lid); + std::vector ghost_ranks_for_this_boundary_elem; + for (const auto &pr : boundary_elem_targets[elem_lid]) { + ghost_ranks_for_this_boundary_elem.push_back(pr.first); + ghost_comm_ranks.insert(pr.first); + } + boundary_to_ghost_ranks.push_back(ghost_ranks_for_this_boundary_elem); + } + } + + int num_ghost_comm_ranks = ghost_comm_ranks.size(); + std::vector ghost_comm_ranks_vec(num_ghost_comm_ranks); + int i = 0; + for (const auto &r : ghost_comm_ranks) { + ghost_comm_ranks_vec[i] = r; + i++; + } + + + MPI_Barrier(MPI_COMM_WORLD); + + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << std::endl; + std::cout << "[rank " << rank << "] communicates to ranks: "; + for (int i = 0; i < num_ghost_comm_ranks; ++i) { + std::cout << ghost_comm_ranks_vec[i] << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + // Print out the boundary element local ids on each rank sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r && print_info) { + std::cout << std::endl; + std::cout << "[rank " << rank << "] Boundary element global ids: " < [ranks that ghost it]. + // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + + std::vector>> boundary_node_targets(intermediate_mesh.num_nodes); + + // Prepare local ghost node list as vector + std::vector ghost_node_gids_vec; + ghost_node_gids_vec.reserve(ghost_only_nodes.size()); + for (const auto &g : ghost_only_nodes) ghost_node_gids_vec.push_back(g); + + // Exchange counts + std::vector ghost_node_counts(world_size, 0); + int local_ghost_node_count = static_cast(ghost_node_gids_vec.size()); + MPI_Allgather(&local_ghost_node_count, 1, MPI_INT, ghost_node_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Displacements and recv buffer + std::vector ghost_node_displs(world_size, 0); + int total_ghost_nodes = 0; + for (int r = 0; r < world_size; ++r) { + ghost_node_displs[r] = total_ghost_nodes; + total_ghost_nodes += ghost_node_counts[r]; + } + std::vector all_ghost_node_gids(total_ghost_nodes); + + // Gather ghost node gids + MPI_Allgatherv(ghost_node_gids_vec.data(), local_ghost_node_count, MPI_UNSIGNED_LONG_LONG, + all_ghost_node_gids.data(), ghost_node_counts.data(), ghost_node_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished gathering ghost node GIDs" << std::endl; + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build the reverse map for node communication" << std::endl; + + // Build map node_gid -> ranks that ghost it + std::unordered_map> node_gid_to_ghosting_ranks; + node_gid_to_ghosting_ranks.reserve(static_cast(total_ghost_nodes)); + for (int r = 0; r < world_size; ++r) { + int cnt = ghost_node_counts[r]; + int off = ghost_node_displs[r]; + for (int i = 0; i < cnt; ++i) { + size_t g = all_ghost_node_gids[off + i]; + node_gid_to_ghosting_ranks[g].push_back(r); + } + } + + // For each local node, list destinations: ranks that ghost our node gid + for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { + 
size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + auto it = node_gid_to_ghosting_ranks.find(local_node_gid); + if (it == node_gid_to_ghosting_ranks.end()) continue; + const std::vector &dest_ranks = it->second; + for (int rr : dest_ranks) { + if (rr == rank) continue; + boundary_node_targets[node_lid].push_back(std::make_pair(rr, local_node_gid)); + } + } + + std::cout.flush(); + MPI_Barrier(MPI_COMM_WORLD); + print_info = false; + + // Optional: print a compact summary of node reverse map for verification (limited output) + for(int i = 0; i < world_size; i++) { + if (rank == i && print_info) { + std::cout << std::endl; + for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { + + size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + if (boundary_node_targets[node_lid].empty()) + { + std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: no ghost nodes" << std::endl; + } + else + { + std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: "; + int shown = 0; + for (const auto &pr : boundary_node_targets[node_lid]) { + if (shown >= 12) { std::cout << " ..."; break; } + std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; + shown++; + } + std::cout << std::endl; + } + } + std::cout.flush(); + } + MPI_Barrier(MPI_COMM_WORLD); + } + + print_info = false; + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; +} + diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index bc3e8371..608c3867 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = 
{100, 100, 100}; + int num_elems_dim[3] = {50, 50, 50}; // Initial mesh built on rank zero Mesh_t initial_mesh; @@ -65,6 +65,7 @@ int main(int argc, char** argv) { // write_vtk(intermediate_mesh, intermediate_node, rank); MPI_Barrier(MPI_COMM_WORLD); write_vtu(final_mesh, final_node, rank, MPI_COMM_WORLD); + // write_vtk(final_mesh, final_node, rank); MPI_Barrier(MPI_COMM_WORLD); // Stop timer and get execution time diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 10d8838f..f0801777 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -515,7 +515,7 @@ void write_vtu(Mesh_t& mesh, const int num_cell_vec_vars = 0; const int num_cell_tensor_vars = 0; - const int num_point_scalar_vars = 2; + const int num_point_scalar_vars = 3; const int num_point_vec_vars = 1; // Scalar values associated with a cell @@ -524,7 +524,7 @@ void write_vtu(Mesh_t& mesh, }; const char point_scalar_var_names[num_point_scalar_vars][15] = { - "rank_id", "elems_in_node" + "rank_id", "elems_in_node", "global_node_id" }; const char point_vec_var_names[num_point_vec_vars][15] = { @@ -557,6 +557,7 @@ void write_vtu(Mesh_t& mesh, point_scalar_fields(node_gid, 0) = rank; point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); + point_scalar_fields(node_gid, 2) = (double)mesh.local_to_global_node_mapping.host(node_gid); } // File management From 285460bd3bc37c186a11f76c8ff00f659d3be1c8 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 3 Nov 2025 15:27:51 -0600 Subject: [PATCH 16/52] ENH: Creating MPI types, WIP --- examples/mesh_decomp/communication_plan.h | 302 +++--------------- examples/mesh_decomp/decomp_utils.h | 18 ++ examples/mesh_decomp/mpi_type.h | 360 ++++++++++++++++++++++ examples/mesh_decomp/state.h | 2 + 4 files changed, 419 insertions(+), 263 deletions(-) create mode 100644 examples/mesh_decomp/mpi_type.h diff --git a/examples/mesh_decomp/communication_plan.h 
b/examples/mesh_decomp/communication_plan.h index 83d2cb46..32833e1a 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -9,12 +9,6 @@ * node.velocity.comm() -> automatically syncs ghost nodes * elem.density.comm() -> automatically syncs ghost elements * - * Memory layout philosophy: - * - Only std::vector (int, size_t, double) - * - CSR-style indexing for variable-length per-rank data - * - No std::map, std::set, std::pair, or nested containers - * - Pre-allocated MPI buffers to avoid repeated allocations - * - Separate element and node communication plans */ struct CommunicationPlan { @@ -25,30 +19,26 @@ // --- Ghost Send Plan: Owned elements/nodes -> destination ranks --- (Works for both elements and nodes) int num_send_ranks; // Number of destination ranks - std::vector send_rank_ids; // [size: num_send_ranks] Destination rank IDs - std::vector send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids - std::vector send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send + DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs + DCArrayKokkos send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids + DCArrayKokkos send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send std::vector send_ghost_gids; // [size: total_send_ghosts] Global IDs (for debug/validation) // --- Ghost Receive Plan: Ghost elements/nodes <- source ranks --- (Works for both elements and nodes) int num_recv_ranks; // Number of source ranks - std::vector recv_rank_ids; // [size: num_recv_ranks] Source rank IDs - std::vector recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids - std::vector recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) + DCArrayKokkos recv_rank_ids; // [size: num_recv_ranks] Source rank IDs + DCArrayKokkos 
recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids + DCArrayKokkos recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) std::vector recv_ghost_gids; // [size: total_recv_ghosts] Global IDs - // --- MPI Communication Buffers (pre-allocated, reusable) --- - std::vector ghost_send_buffer; // Flat buffer for ghost data - std::vector ghost_recv_buffer; // Flat buffer for ghost data - - std::vector send_requests; // Request handles for sends - std::vector recv_requests; // Request handles for receives - std::vector mpi_statuses; // Status array for MPI_Waitall + DCArrayKokkos send_requests; // Request handles for sends + DCArrayKokkos recv_requests; // Request handles for receives + DCArrayKokkos mpi_statuses; // Status array for MPI_Waitall // --- Persistent communication (optional optimization) --- - std::vector persistent_send_requests; - std::vector persistent_recv_requests; + DCArrayKokkos persistent_send_requests; + DCArrayKokkos persistent_recv_requests; bool has_persistent_comm; @@ -57,10 +47,10 @@ bool has_graph_comm; // Whether graph communicator is initialized // Counts and displacements for MPI_Neighbor_alltoallv - std::vector send_counts; // [num_send_ranks] Number of items to send per neighbor - std::vector send_displs; // [num_send_ranks] Displacements in send buffer - std::vector recv_counts; // [num_recv_ranks] Number of items to recv per neighbor - std::vector recv_displs; // [num_recv_ranks] Displacements in recv buffer + DCArrayKokkos send_counts; // [num_send_ranks] Number of items to send per neighbor + DCArrayKokkos send_displs; // [num_send_ranks] Displacements in send buffer + DCArrayKokkos recv_counts; // [num_recv_ranks] Number of items to recv per neighbor + DCArrayKokkos recv_displs; // [num_recv_ranks] Displacements in recv buffer // --- Persistent Neighborhood Collectives (MPI-4.0+) --- MPI_Request persistent_neighbor_request; // Persistent request for neighborhood 
collective @@ -96,245 +86,31 @@ } - /** - * @brief Build communication plan from mesh with flat array inputs - * @param mesh Reference to partitioned mesh (with ghost elements/nodes) - * @param world_size Number of MPI ranks - * @param my_rank Current MPI rank ID - * @param boundary_ghost_dest_ranks Flat array of destination ranks for boundary elements [size: sum of neighbors] - * @param boundary_ghsot_dest_offsets CSR offsets: boundary_ghost_dest_offsets[elem_lid] = start index in boundary_ghost_dest_ranks - * @param boundary_ghost_dest_gids Flat array of global ghost IDs to send [size: sum of neighbors] - * @param all_ghost_gids All ghost global IDs across all ranks - * @param all_ghost_owner_ranks Owner rank for each ghost GID - * - * This build() function takes only flat arrays as input (no std::map, std::set, std::pair). - * The caller must pre-process the mesh data into flat CSR-style arrays. - * - * Implementation: - * 1. Group sends/receives by rank using flat arrays and CSR indexing - * 2. Pre-allocate all MPI buffers - * 3. 
Store everything in contiguous memory - */ - void build( - const Mesh_t& mesh, - int world_size, - int my_rank, - const int* boundary_ghost_dest_ranks, // Flat array of dest ranks - const int* boundary_ghost_dest_offsets, // CSR offsets [size: num_owned_ghosts+1] - const size_t* boundary_ghost_dest_gids, // Flat array of ghost GIDs - const size_t* all_ghost_gids, // All ghost GIDs - const int* all_ghost_owner_ranks, // Owner ranks indexed by GID - ); - - - // ======================================================================== - // COMMUNICATION INTERFACE - FOR DISTRIBUTED DATA STRUCTURES - // ======================================================================== - - /** - * @brief Pack and exchange data with automatic ghost synchronization - * @param data_ptr Pointer to data array [size: num_total_items * stride] - * @param num_fields Number of fields per item (stride) - * @param item_type 0=elements, 1=nodes - * @param comm MPI communicator - * @param blocking If true, waits for completion before returning - * - * This is the main interface for distributed structures like: - * node.velocity.comm() internally calls: - * comm_plan.communicate(node.velocity.data(), 3, 1, MPI_COMM_WORLD, true) - */ - void communicate(double* data_ptr, int num_fields, int item_type, - MPI_Comm comm = MPI_COMM_WORLD, bool blocking = true); - - - /** - * @brief Non-blocking version: initiate communication - * Returns immediately; user must call wait_communication() - */ - void communicate_begin(double* data_ptr, int num_fields, int item_type, - MPI_Comm comm = MPI_COMM_WORLD); - - - /** - * @brief Wait for non-blocking communication to complete - */ - void wait_communication(double* data_ptr, int num_fields, int item_type); - - - // ======================================================================== - // LOW-LEVEL PACK/UNPACK (for manual control) - // ======================================================================== - - /** - * @brief Pack element data from contiguous 
array into send buffer - * @param data_ptr Pointer to element data [size: num_total_elems * num_fields] - * @param num_fields Stride (fields per element) - * - * Packs data in layout: [elem0_field0, elem0_field1, ..., elem1_field0, ...] - */ - void pack_ghosts(const double* data_ptr, int num_fields, int field_dimension); - - - /** - * @brief Unpack received element data into ghost elements - */ - void unpack_ghosts(double* data_ptr, int num_fields, int field_dimension); - - - - // ======================================================================== - // MPI EXCHANGE PRIMITIVES - // ======================================================================== - - /** - * @brief Execute MPI_Isend/Irecv for elements - */ - void exchange_ghosts_begin(int num_fields, int field_dimension, MPI_Comm comm = MPI_COMM_WORLD); - - - /** - * @brief Wait for element exchange to complete - */ - void exchange_ghosts_wait(); - - - - // ======================================================================== - // PERSISTENT COMMUNICATION (OPTIMIZATION) - // ======================================================================== - - /** - * @brief Setup persistent MPI communication handles (one-time setup) - * Call once after build(), then use start_persistent/wait_persistent - */ - void init_persistent(int elem_fields, int node_fields, MPI_Comm comm = MPI_COMM_WORLD); - - - /** - * @brief Start persistent send/recv (must call pack_* first) - */ - void start_persistent(); - - - /** - * @brief Wait for persistent communication (then call unpack_*) - */ - void wait_persistent(); - - - /** - * @brief Free persistent communication handles - */ - void free_persistent(); - - - // ======================================================================== - // NEIGHBORHOOD COLLECTIVES (MPI-3.0+) - // ======================================================================== - - /** - * @brief Create distributed graph communicator from communication pattern - * - * Call this ONCE after populating 
send_rank_ids and recv_rank_ids. - * The graph communicator encodes the sparse communication topology and is - * reused for all subsequent neighborhood collective calls. - * - * @param base_comm Base communicator (usually MPI_COMM_WORLD) - * - * Example from your output: - * rank 0 sends to: {2, 3, 4, 10, 11} - * rank 0 receives from: {computed from ghost ownership} - * - * This creates a directed graph where edges represent communication channels. - * MPI can optimize routing and minimize network contention. - * - * Requirements: MPI-3.0+ (2012) - */ - void create_graph_communicator(MPI_Comm base_comm = MPI_COMM_WORLD); - - - /** - * @brief Exchange ghost data using MPI_Neighbor_alltoallv - * - * Uses the pre-created graph communicator for efficient sparse communication. - * This is cleaner than manual Isend/Irecv loops and allows MPI to optimize. - * - * @param data_ptr Pointer to data array [size: num_total_items * num_fields] - * @param num_fields Number of fields per item (e.g., 3 for velocity) - * - * Workflow: - * 1. Pack owned items into send buffer - * 2. Call MPI_Neighbor_alltoallv (blocking but fast with graph_comm) - * 3. Unpack ghost items from receive buffer - * - * The graph_comm is reused each call - only pack/unpack overhead per timestep. - * - * Requirements: Must call create_graph_communicator() once before using this. - */ - void exchange_ghosts_neighborhood(double* data_ptr, int num_fields); - - - /** - * @brief Initialize persistent neighborhood collective (MPI-4.0+) - * - * Creates a persistent MPI request that pre-allocates all internal buffers - * and communication paths. Provides maximum performance for repeated exchanges - * with the same num_fields. 
- * - * @param num_fields Number of fields per item (must be same for all timesteps) - * - * Call once during setup: - * comm_plan.create_graph_communicator(MPI_COMM_WORLD); - * comm_plan.init_persistent_neighborhood(3); // For 3D velocity - * - * Then use exchange_ghosts_persistent() each timestep. - * - * Requirements: MPI-4.0+ (2021). Check with: mpirun --version - */ - void init_persistent_neighborhood(int num_fields); - - - /** - * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) - * - * Must call init_persistent_neighborhood() once before using this. - * This is the fastest ghost exchange method for fixed communication patterns. - * - * @param data_ptr Pointer to data array [size: num_total_items * num_fields] - * - * Workflow: - * 1. Pack data into same send buffer used during init - * 2. MPI_Start() - extremely fast, no setup overhead - * 3. MPI_Wait() - wait for completion - * 4. Unpack from receive buffer - * - * Typical speedup vs standard neighborhood: 1.2-1.5x - * - * Note: Falls back to exchange_ghosts_neighborhood() if MPI-4 unavailable. - */ - void exchange_ghosts_persistent(double* data_ptr); - - - /** - * @brief Free persistent neighborhood collective resources - * - * Call at end of simulation to release MPI resources. - * Automatically called by destructor if not explicitly freed. 
- */ - void free_persistent_neighborhood(); - - - // ======================================================================== - // UTILITIES - // ======================================================================== - - void print_summary(int rank) const; - bool validate(MPI_Comm comm = MPI_COMM_WORLD) const; - size_t send_volume(int elem_fields, int node_fields) const; - size_t recv_volume(int elem_fields, int node_fields) const; - bool needs_communication() const; - int num_neighbor_ranks() const; + void initialize(int num_send_ranks, int num_recv_ranks){ + this->num_send_ranks = num_send_ranks; + this->num_recv_ranks = num_recv_ranks; + + send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); + recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); + send_ghost_offsets = DCArrayKokkos(num_send_ranks + 1, "send_ghost_offsets"); + recv_ghost_offsets = DCArrayKokkos(num_recv_ranks + 1, "recv_ghost_offsets"); + send_ghost_lids = DCArrayKokkos(total_send_ghosts, "send_ghost_lids"); + recv_ghost_lids = DCArrayKokkos(total_recv_ghosts, "recv_ghost_lids"); + send_ghost_gids = std::vector(total_send_ghosts, "send_ghost_gids"); + recv_ghost_gids = std::vector(total_recv_ghosts, "recv_ghost_gids"); + send_requests = DCArrayKokkos(total_send_ghosts, "send_requests"); + recv_requests = DCArrayKokkos(total_recv_ghosts, "recv_requests"); + mpi_statuses = DCArrayKokkos(total_send_ghosts + total_recv_ghosts, "mpi_statuses"); + persistent_send_requests = DCArrayKokkos(total_send_ghosts, "persistent_send_requests"); + persistent_recv_requests = DCArrayKokkos(total_recv_ghosts, "persistent_recv_requests"); + send_counts = DCArrayKokkos(num_send_ranks, "send_counts"); + send_displs = DCArrayKokkos(num_send_ranks, "send_displs"); + recv_counts = DCArrayKokkos(num_recv_ranks, "recv_counts"); + recv_displs = DCArrayKokkos(num_recv_ranks, "recv_displs"); + + } + // ======================================================================== // INLINE IMPLEMENTATIONS - 
NEIGHBORHOOD COLLECTIVES diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 26dd83c6..9c4267bf 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -15,6 +15,9 @@ #include "state.h" #include "mesh_io.h" + +#include "communication_plan.h" + // Include Scotch headers #include "scotch.h" #include "ptscotch.h" @@ -2013,6 +2016,11 @@ void partition_mesh( + // Build communication plans for elements + CommunicationPlan element_comm_plan; + + + element_comm_plan.initialize(num_send_ranks, num_recv_ranks); @@ -2122,6 +2130,16 @@ void partition_mesh( if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; + + + // Build communication plans for elements and nodes + CommunicationPlan element_comm_plan; + CommunicationPlan node_comm_plan; + + element_comm_plan.build(intermediate_mesh, world_size, rank, boundary_elem_targets, boundary_elem_local_ids, boundary_to_ghost_ranks); + node_comm_plan.build(intermediate_mesh, world_size, rank, boundary_node_targets, boundary_node_local_ids, boundary_to_ghost_ranks); + + } diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h new file mode 100644 index 00000000..5ba78be9 --- /dev/null +++ b/examples/mesh_decomp/mpi_type.h @@ -0,0 +1,360 @@ +#ifndef MPIDARRAYKOKKOS_H +#define MPIDARRAYKOKKOS_H + +#include "matar.h" +#include "communication_plan.h" + +using namespace mtr; + +///////////////////////// +// MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// +// Enhanced with automatic ghost synchronization via CommunicationPlan. +// Allocates space for owned + ghost items and provides communicate() method. 
+// +// Usage: +// node.coords.communicate() -> syncs ghost nodes automatically +// elem.density.communicate() -> syncs ghost elements automatically +///////////////////////// +template +class MPIDArrayKokkos { + + // this is manage + using TArray1D = Kokkos::DualView ; + +protected: + size_t dims_[7]; + size_t length_; + size_t order_; // tensor order (rank) + int mpi_recv_rank_; + int mpi_tag_; + MPI_Comm mpi_comm_; + MPI_Status mpi_status_; + MPI_Datatype mpi_datatype_; + MPI_Request mpi_request_; + TArray1D this_array_; + + // --- Ghost Communication Support --- + CommunicationPlan* comm_plan_; // Pointer to shared communication plan + size_t num_owned_items_; // Number of owned items (nodes/elements) + size_t num_total_items_; // Total items including ghosts (owned + ghost) + size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) + + void set_mpi_type(); + +public: + // Data member to access host view + ViewCArray host; + + MPIDArrayKokkos(); + + MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + + + // ======================================================================== + // DISTRIBUTED COMMUNICATION METHODS (NEW) + // 
======================================================================== + + /** + * @brief Set communication plan and ghost metadata + * + * Call this ONCE after allocating the array to enable ghost communication. + * Multiple fields can share the same CommunicationPlan pointer. + * + * @param plan Pointer to shared CommunicationPlan (node or element plan) + * @param num_owned Number of owned items on this rank + * @param num_total Total items including ghosts (owned + ghost) + * + * Example: + * node.coords = MPIDArrayKokkos(num_total_nodes, 3); + * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); + */ + void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); + + + /** + * @brief Synchronize ghost data using neighborhood collectives + * + * Automatically exchanges boundary → ghost data for this field. + * Uses the CommunicationPlan provided via set_communication_plan(). + * + * Workflow: + * 1. Updates host data from device (if needed) + * 2. Packs owned boundary items + * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) + * 4. Unpacks into ghost items + * 5. Updates device with new ghost data + * + * Example usage: + * // Update owned nodes + * for (int i = 0; i < num_owned_nodes; i++) { + * node.coords(i, 0) += dt * velocity(i, 0); + * } + * + * // Sync ghosts + * node.coords.communicate(); + * + * // Now ghost data is current + */ + void communicate(); + + + /** + * @brief Non-blocking version: start ghost exchange + * + * For advanced users who want to overlap computation with communication. + * Must call communicate_wait() before accessing ghost data. 
+ */ + void communicate_begin(); + + + /** + * @brief Wait for non-blocking ghost exchange to complete + */ + void communicate_wait(); + + + /** + * @brief Get number of owned items (excludes ghosts) + */ + KOKKOS_INLINE_FUNCTION + size_t num_owned() const { return num_owned_items_; } + + + /** + * @brief Get total items including ghosts + */ + KOKKOS_INLINE_FUNCTION + size_t num_total() const { return num_total_items_; } + + + /** + * @brief Check if ghost communication is configured + */ + bool has_communication_plan() const { return comm_plan_ != nullptr; } + + // These functions can setup the data needed for halo send/receives + // Not necessary for standard MPI comms + void mpi_setup(); + + void mpi_setup(int recv_rank); + + void mpi_setup(int recv_rank, int tag); + + void mpi_setup(int recv_rank, int tag, MPI_Comm comm); + + void mpi_set_rank(int recv_rank); + + void mpi_set_tag(int tag); + + void mpi_set_comm(MPI_Comm comm); + + int get_rank(); + + int get_tag(); + + MPI_Comm get_comm(); + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n, size_t o) const; + + KOKKOS_INLINE_FUNCTION + MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); + + // GPU Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t size() const; + + // Host Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t extent() const; + + KOKKOS_INLINE_FUNCTION + size_t dims(size_t i) const; + + 
KOKKOS_INLINE_FUNCTION + size_t order() const; + + // Method returns the raw device pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* device_pointer() const; + + // Method returns the raw host pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* host_pointer() const; + + // Method returns kokkos dual view + KOKKOS_INLINE_FUNCTION + TArray1D get_kokkos_dual_view() const; + + // Method that update host view + void update_host(); + + // Method that update device view + void update_device(); + + + + // Deconstructor + virtual KOKKOS_INLINE_FUNCTION + ~MPIDArrayKokkos (); +}; // End of MPIDArrayKokkos + + +// ============================================================================ +// INLINE IMPLEMENTATIONS - DISTRIBUTED COMMUNICATION +// ============================================================================ + +/** + * @brief Default constructor - initialize ghost communication members + */ +template +KOKKOS_INLINE_FUNCTION +MPIDArrayKokkos::MPIDArrayKokkos() + : comm_plan_(nullptr), + num_owned_items_(0), + num_total_items_(0), + num_fields_(0) +{ + // Base constructor handles array initialization +} + + +/** + * @brief Set communication plan and ghost metadata + */ +template +inline void MPIDArrayKokkos::set_communication_plan( + CommunicationPlan* plan, + size_t num_owned, + size_t num_total) +{ + comm_plan_ = plan; + num_owned_items_ = num_owned; + num_total_items_ = num_total; + + // Infer number of fields from array dimensions + // Assumption: dim0 = num_items, dim1+ = fields + if (order_ == 1) { + num_fields_ = 1; // Scalar field + } else if (order_ == 2) { + num_fields_ = dims_[1]; // Vector field (e.g., coords[num_nodes, 3]) + } else { + // For higher order tensors, treat everything after dim0 as fields + num_fields_ = 1; + for (size_t i = 1; i < order_; i++) { + num_fields_ *= dims_[i]; + } + } + + // Validate dimensions match total items + if (dims_[0] != num_total) { + std::cerr << "Error: Array dim0 (" << dims_[0] << ") does not 
match num_total (" + << num_total << ")" << std::endl; + std::cerr << " Array must be allocated with size = num_owned + num_ghost" << std::endl; + } +} + + +/** + * @brief Synchronize ghost data using neighborhood collectives + */ +template +inline void MPIDArrayKokkos::communicate() +{ + if (!comm_plan_) { + std::cerr << "Error: CommunicationPlan not set. Call set_communication_plan() first." << std::endl; + return; + } + + if (!comm_plan_->has_graph_comm) { + std::cerr << "Error: Graph communicator not initialized in CommunicationPlan." << std::endl; + std::cerr << " Call comm_plan.create_graph_communicator() first." << std::endl; + return; + } + + // 1. Update host from device (ensure data is current on CPU for MPI) + this->update_host(); + + // 2. Get raw pointer to data + T* data_ptr = this->host_pointer(); + + // 3. Convert to double* for MPI communication + // TODO: Support other types (int, float, etc.) with template specialization + static_assert(std::is_same::value, + "Currently only double supported for ghost communication"); + + double* double_ptr = reinterpret_cast(data_ptr); + + // 4. Call neighborhood collective exchange + comm_plan_->exchange_ghosts_neighborhood(double_ptr, static_cast(num_fields_)); + + // 5. 
Update device with new ghost data + this->update_device(); +} + + +/** + * @brief Non-blocking version: start ghost exchange + */ +template +inline void MPIDArrayKokkos::communicate_begin() +{ + // TODO: Implement non-blocking version using Isend/Irecv + // For now, just call blocking version + std::cerr << "Warning: communicate_begin() not yet implemented, using blocking communicate()" << std::endl; + communicate(); +} + + +/** + * @brief Wait for non-blocking ghost exchange to complete + */ +template +inline void MPIDArrayKokkos::communicate_wait() +{ + // TODO: Implement non-blocking version + // For now, this is a no-op since communicate_begin() is blocking +} + + +#endif // MPIDARRAYKOKKOS_H diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 7a1cb676..8afb9abf 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -55,6 +55,8 @@ enum class node_state ///////////////////////////////////////////////////////////////////////////// struct node_t { + + // Replace with MPIDCArrayKokkos DCArrayKokkos coords; ///< Nodal coordinates DCArrayKokkos coords_n0; ///< Nodal coordinates at tn=0 of time integration From eb377938f54e5456d1eab13c72cd22d6c63df528 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 4 Nov 2025 16:38:56 -0600 Subject: [PATCH 17/52] ENH: Testing Neighbor comms, WIP --- examples/mesh_decomp/communication_plan.h | 572 ++++++++++----------- examples/mesh_decomp/decomp_utils.h | 489 ++++++++++++++++-- examples/mesh_decomp/mesh.h | 10 +- examples/mesh_decomp/mesh_decomp.cpp | 6 +- examples/mesh_decomp/mesh_io.h | 6 +- examples/mesh_decomp/mpi_type.h | 588 +++++++++++----------- examples/mesh_decomp/state.h | 8 +- 7 files changed, 1058 insertions(+), 621 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index 32833e1a..7c6f9ecb 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -1,348 
+1,348 @@ -/** - * @struct CommunicationPlan - * @brief Manages efficient MPI communication for ghost element and node data exchange - * - * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. - * Designed to be embedded in distributed data structures for automatic ghost synchronization. - * - * Usage pattern in distributed structures: - * node.velocity.comm() -> automatically syncs ghost nodes - * elem.density.comm() -> automatically syncs ghost elements - * - */ - struct CommunicationPlan { +// /** +// * @struct CommunicationPlan +// * @brief Manages efficient MPI communication for ghost element and node data exchange +// * +// * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. +// * Designed to be embedded in distributed data structures for automatic ghost synchronization. +// * +// * Usage pattern in distributed structures: +// * node.velocity.comm() -> automatically syncs ghost nodes +// * elem.density.comm() -> automatically syncs ghost elements +// * +// */ +// struct CommunicationPlan { - // ======================================================================== - // CORE DATA STRUCTURES - FLAT ARRAYS ONLY - // ======================================================================== +// // ======================================================================== +// // CORE DATA STRUCTURES - FLAT ARRAYS ONLY +// // ======================================================================== - // --- Ghost Send Plan: Owned elements/nodes -> destination ranks --- (Works for both elements and nodes) - int num_send_ranks; // Number of destination ranks - DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs - DCArrayKokkos send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids - DCArrayKokkos send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send - std::vector send_ghost_gids; // [size: total_send_ghosts] 
Global IDs (for debug/validation) +// // --- Ghost Send Plan: Owned elements/nodes -> destination ranks --- (Works for both elements and nodes) +// int num_send_ranks; // Number of destination ranks +// DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs +// DCArrayKokkos send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids +// DCArrayKokkos send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send +// std::vector send_ghost_gids; // [size: total_send_ghosts] Global IDs (for debug/validation) - // --- Ghost Receive Plan: Ghost elements/nodes <- source ranks --- (Works for both elements and nodes) - int num_recv_ranks; // Number of source ranks - DCArrayKokkos recv_rank_ids; // [size: num_recv_ranks] Source rank IDs - DCArrayKokkos recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids - DCArrayKokkos recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) - std::vector recv_ghost_gids; // [size: total_recv_ghosts] Global IDs +// // --- Ghost Receive Plan: Ghost elements/nodes <- source ranks --- (Works for both elements and nodes) +// int num_recv_ranks; // Number of source ranks +// DCArrayKokkos recv_rank_ids; // [size: num_recv_ranks] Source rank IDs +// DCArrayKokkos recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids +// DCArrayKokkos recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) +// std::vector recv_ghost_gids; // [size: total_recv_ghosts] Global IDs - DCArrayKokkos send_requests; // Request handles for sends - DCArrayKokkos recv_requests; // Request handles for receives - DCArrayKokkos mpi_statuses; // Status array for MPI_Waitall +// DCArrayKokkos send_requests; // Request handles for sends +// DCArrayKokkos recv_requests; // Request handles for receives +// DCArrayKokkos mpi_statuses; // Status array for MPI_Waitall - // --- Persistent 
communication (optional optimization) --- - DCArrayKokkos persistent_send_requests; - DCArrayKokkos persistent_recv_requests; - bool has_persistent_comm; +// // --- Persistent communication (optional optimization) --- +// DCArrayKokkos persistent_send_requests; +// DCArrayKokkos persistent_recv_requests; +// bool has_persistent_comm; - // --- Distributed Graph Topology for Neighborhood Collectives --- - MPI_Comm graph_comm; // Graph communicator encoding sparse communication pattern - bool has_graph_comm; // Whether graph communicator is initialized +// // --- Distributed Graph Topology for Neighborhood Collectives --- +// MPI_Comm graph_comm; // Graph communicator encoding sparse communication pattern +// bool has_graph_comm; // Whether graph communicator is initialized - // Counts and displacements for MPI_Neighbor_alltoallv - DCArrayKokkos send_counts; // [num_send_ranks] Number of items to send per neighbor - DCArrayKokkos send_displs; // [num_send_ranks] Displacements in send buffer - DCArrayKokkos recv_counts; // [num_recv_ranks] Number of items to recv per neighbor - DCArrayKokkos recv_displs; // [num_recv_ranks] Displacements in recv buffer +// // Counts and displacements for MPI_Neighbor_alltoallv +// DCArrayKokkos send_counts; // [num_send_ranks] Number of items to send per neighbor +// DCArrayKokkos send_displs; // [num_send_ranks] Displacements in send buffer +// DCArrayKokkos recv_counts; // [num_recv_ranks] Number of items to recv per neighbor +// DCArrayKokkos recv_displs; // [num_recv_ranks] Displacements in recv buffer - // --- Persistent Neighborhood Collectives (MPI-4.0+) --- - MPI_Request persistent_neighbor_request; // Persistent request for neighborhood collective - bool has_persistent_neighbor; // Whether persistent neighborhood is initialized - int persistent_num_fields; // Fields per item for persistent request +// // --- Persistent Neighborhood Collectives (MPI-4.0+) --- +// MPI_Request persistent_neighbor_request; // Persistent request 
for neighborhood collective +// bool has_persistent_neighbor; // Whether persistent neighborhood is initialized +// int persistent_num_fields; // Fields per item for persistent request - // ======================================================================== - // CONSTRUCTOR / INITIALIZATION - // ======================================================================== +// // ======================================================================== +// // CONSTRUCTOR / INITIALIZATION +// // ======================================================================== - CommunicationPlan() - : num_send_ranks(0), num_recv_ranks(0), - has_persistent_comm(false), - has_graph_comm(false), - has_persistent_neighbor(false), - graph_comm(MPI_COMM_NULL), - persistent_neighbor_request(MPI_REQUEST_NULL), - persistent_num_fields(0) {} +// CommunicationPlan() +// : num_send_ranks(0), num_recv_ranks(0), +// has_persistent_comm(false), +// has_graph_comm(false), +// has_persistent_neighbor(false), +// graph_comm(MPI_COMM_NULL), +// persistent_neighbor_request(MPI_REQUEST_NULL), +// persistent_num_fields(0) {} - // Destructor to free MPI resources - ~CommunicationPlan() { - // Free persistent neighborhood collective - if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { - MPI_Request_free(&persistent_neighbor_request); - } +// // Destructor to free MPI resources +// ~CommunicationPlan() { +// // Free persistent neighborhood collective +// if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { +// MPI_Request_free(&persistent_neighbor_request); +// } - // Free graph communicator - if (has_graph_comm && graph_comm != MPI_COMM_NULL) { - MPI_Comm_free(&graph_comm); - } - } +// // Free graph communicator +// if (has_graph_comm && graph_comm != MPI_COMM_NULL) { +// MPI_Comm_free(&graph_comm); +// } +// } - void initialize(int num_send_ranks, int num_recv_ranks){ - this->num_send_ranks = num_send_ranks; - this->num_recv_ranks = 
num_recv_ranks; +// void initialize(int num_send_ranks, int num_recv_ranks){ +// this->num_send_ranks = num_send_ranks; +// this->num_recv_ranks = num_recv_ranks; - send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); - recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); - send_ghost_offsets = DCArrayKokkos(num_send_ranks + 1, "send_ghost_offsets"); - recv_ghost_offsets = DCArrayKokkos(num_recv_ranks + 1, "recv_ghost_offsets"); - send_ghost_lids = DCArrayKokkos(total_send_ghosts, "send_ghost_lids"); - recv_ghost_lids = DCArrayKokkos(total_recv_ghosts, "recv_ghost_lids"); - send_ghost_gids = std::vector(total_send_ghosts, "send_ghost_gids"); - recv_ghost_gids = std::vector(total_recv_ghosts, "recv_ghost_gids"); - send_requests = DCArrayKokkos(total_send_ghosts, "send_requests"); - recv_requests = DCArrayKokkos(total_recv_ghosts, "recv_requests"); - mpi_statuses = DCArrayKokkos(total_send_ghosts + total_recv_ghosts, "mpi_statuses"); - persistent_send_requests = DCArrayKokkos(total_send_ghosts, "persistent_send_requests"); - persistent_recv_requests = DCArrayKokkos(total_recv_ghosts, "persistent_recv_requests"); - send_counts = DCArrayKokkos(num_send_ranks, "send_counts"); - send_displs = DCArrayKokkos(num_send_ranks, "send_displs"); - recv_counts = DCArrayKokkos(num_recv_ranks, "recv_counts"); - recv_displs = DCArrayKokkos(num_recv_ranks, "recv_displs"); +// send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); +// recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); +// send_ghost_offsets = DCArrayKokkos(num_send_ranks + 1, "send_ghost_offsets"); +// recv_ghost_offsets = DCArrayKokkos(num_recv_ranks + 1, "recv_ghost_offsets"); +// send_ghost_lids = DCArrayKokkos(total_send_ghosts, "send_ghost_lids"); +// recv_ghost_lids = DCArrayKokkos(total_recv_ghosts, "recv_ghost_lids"); +// send_ghost_gids = std::vector(total_send_ghosts, "send_ghost_gids"); +// recv_ghost_gids = std::vector(total_recv_ghosts, "recv_ghost_gids"); +// 
send_requests = DCArrayKokkos(total_send_ghosts, "send_requests"); +// recv_requests = DCArrayKokkos(total_recv_ghosts, "recv_requests"); +// mpi_statuses = DCArrayKokkos(total_send_ghosts + total_recv_ghosts, "mpi_statuses"); +// persistent_send_requests = DCArrayKokkos(total_send_ghosts, "persistent_send_requests"); +// persistent_recv_requests = DCArrayKokkos(total_recv_ghosts, "persistent_recv_requests"); +// send_counts = DCArrayKokkos(num_send_ranks, "send_counts"); +// send_displs = DCArrayKokkos(num_send_ranks, "send_displs"); +// recv_counts = DCArrayKokkos(num_recv_ranks, "recv_counts"); +// recv_displs = DCArrayKokkos(num_recv_ranks, "recv_displs"); - } +// } - // ======================================================================== - // INLINE IMPLEMENTATIONS - NEIGHBORHOOD COLLECTIVES - // ======================================================================== +// // ======================================================================== +// // INLINE IMPLEMENTATIONS - NEIGHBORHOOD COLLECTIVES +// // ======================================================================== - /** - * @brief Create distributed graph communicator from communication pattern - */ - inline void create_graph_communicator(MPI_Comm base_comm) { +// /** +// * @brief Create distributed graph communicator from communication pattern +// */ +// inline void create_graph_communicator(MPI_Comm base_comm) { - if (has_graph_comm) { - std::cerr << "Warning: Graph communicator already created, skipping." << std::endl; - return; - } +// if (has_graph_comm) { +// std::cerr << "Warning: Graph communicator already created, skipping." 
<< std::endl; +// return; +// } - int indegree = num_recv_ranks; // Number of ranks we receive FROM - int outdegree = num_send_ranks; // Number of ranks we send TO +// int indegree = num_recv_ranks; // Number of ranks we receive FROM +// int outdegree = num_send_ranks; // Number of ranks we send TO - // Create the distributed graph communicator - // MPI_Dist_graph_create_adjacent signature: - // (comm_old, indegree, sources[], sourceweights, outdegree, dests[], destweights, - // info, reorder, comm_dist_graph) - int reorder = 0; // Don't reorder ranks (keep same as base_comm) +// // Create the distributed graph communicator +// // MPI_Dist_graph_create_adjacent signature: +// // (comm_old, indegree, sources[], sourceweights, outdegree, dests[], destweights, +// // info, reorder, comm_dist_graph) +// int reorder = 0; // Don't reorder ranks (keep same as base_comm) - MPI_Dist_graph_create_adjacent( - base_comm, // Base communicator - indegree, // We receive from num_recv_ranks neighbors - recv_rank_ids.data(), // Source ranks (we receive from these) - MPI_UNWEIGHTED, // No edge weights for sources - outdegree, // We send to num_send_ranks neighbors - send_rank_ids.data(), // Destination ranks (we send to these) - MPI_UNWEIGHTED, // No edge weights for destinations - MPI_INFO_NULL, // No special hints - reorder, // Don't reorder ranks - &graph_comm // Output: new graph communicator - ); +// MPI_Dist_graph_create_adjacent( +// base_comm, // Base communicator +// indegree, // We receive from num_recv_ranks neighbors +// recv_rank_ids.data(), // Source ranks (we receive from these) +// MPI_UNWEIGHTED, // No edge weights for sources +// outdegree, // We send to num_send_ranks neighbors +// send_rank_ids.data(), // Destination ranks (we send to these) +// MPI_UNWEIGHTED, // No edge weights for destinations +// MPI_INFO_NULL, // No special hints +// reorder, // Don't reorder ranks +// &graph_comm // Output: new graph communicator +// ); - has_graph_comm = true; +// 
has_graph_comm = true; - // Pre-allocate counts and displacements arrays - send_counts.resize(num_send_ranks); - send_displs.resize(num_send_ranks); - recv_counts.resize(num_recv_ranks); - recv_displs.resize(num_recv_ranks); - } +// // Pre-allocate counts and displacements arrays +// send_counts.resize(num_send_ranks); +// send_displs.resize(num_send_ranks); +// recv_counts.resize(num_recv_ranks); +// recv_displs.resize(num_recv_ranks); +// } - /** - * @brief Exchange ghost data using MPI_Neighbor_alltoallv - */ - inline void exchange_ghosts_neighborhood(double* data_ptr, int num_fields) { +// /** +// * @brief Exchange ghost data using MPI_Neighbor_alltoallv +// */ +// inline void exchange_ghosts_neighborhood(double* data_ptr, int num_fields) { - if (!has_graph_comm) { - std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; - return; - } +// if (!has_graph_comm) { +// std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; +// return; +// } - // 1. Pack send buffer from owned items - int total_send = send_ghost_lids.size(); - ghost_send_buffer.resize(total_send * num_fields); +// // 1. Pack send buffer from owned items +// int total_send = send_ghost_lids.size(); +// ghost_send_buffer.resize(total_send * num_fields); - for (size_t i = 0; i < send_ghost_lids.size(); i++) { - int local_id = send_ghost_lids[i]; - for (int f = 0; f < num_fields; f++) { - ghost_send_buffer[i * num_fields + f] = data_ptr[local_id * num_fields + f]; - } - } +// for (size_t i = 0; i < send_ghost_lids.size(); i++) { +// int local_id = send_ghost_lids[i]; +// for (int f = 0; f < num_fields; f++) { +// ghost_send_buffer[i * num_fields + f] = data_ptr[local_id * num_fields + f]; +// } +// } - // 2. 
Update counts and displacements for this num_fields - for (int i = 0; i < num_send_ranks; i++) { - int start_idx = send_ghost_offsets[i]; - int end_idx = send_ghost_offsets[i + 1]; - send_counts[i] = (end_idx - start_idx) * num_fields; - send_displs[i] = start_idx * num_fields; - } +// // 2. Update counts and displacements for this num_fields +// for (int i = 0; i < num_send_ranks; i++) { +// int start_idx = send_ghost_offsets[i]; +// int end_idx = send_ghost_offsets[i + 1]; +// send_counts[i] = (end_idx - start_idx) * num_fields; +// send_displs[i] = start_idx * num_fields; +// } - int total_recv = recv_ghost_lids.size(); - ghost_recv_buffer.resize(total_recv * num_fields); +// int total_recv = recv_ghost_lids.size(); +// ghost_recv_buffer.resize(total_recv * num_fields); - for (int i = 0; i < num_recv_ranks; i++) { - int start_idx = recv_ghost_offsets[i]; - int end_idx = recv_ghost_offsets[i + 1]; - recv_counts[i] = (end_idx - start_idx) * num_fields; - recv_displs[i] = start_idx * num_fields; - } +// for (int i = 0; i < num_recv_ranks; i++) { +// int start_idx = recv_ghost_offsets[i]; +// int end_idx = recv_ghost_offsets[i + 1]; +// recv_counts[i] = (end_idx - start_idx) * num_fields; +// recv_displs[i] = start_idx * num_fields; +// } - // 3. Execute neighborhood collective (BLOCKING but fast with graph_comm) - // MPI_Neighbor_alltoallv signature: - // (sendbuf, sendcounts[], sdispls[], sendtype, - // recvbuf, recvcounts[], rdispls[], recvtype, comm) - MPI_Neighbor_alltoallv( - ghost_send_buffer.data(), // Send buffer - send_counts.data(), // Send counts per neighbor - send_displs.data(), // Send displacements - MPI_DOUBLE, // Send type - ghost_recv_buffer.data(), // Receive buffer - recv_counts.data(), // Receive counts per neighbor - recv_displs.data(), // Receive displacements - MPI_DOUBLE, // Receive type - graph_comm // Graph communicator (NOT MPI_COMM_WORLD!) - ); +// // 3. 
Execute neighborhood collective (BLOCKING but fast with graph_comm) +// // MPI_Neighbor_alltoallv signature: +// // (sendbuf, sendcounts[], sdispls[], sendtype, +// // recvbuf, recvcounts[], rdispls[], recvtype, comm) +// MPI_Neighbor_alltoallv( +// ghost_send_buffer.data(), // Send buffer +// send_counts.data(), // Send counts per neighbor +// send_displs.data(), // Send displacements +// MPI_DOUBLE, // Send type +// ghost_recv_buffer.data(), // Receive buffer +// recv_counts.data(), // Receive counts per neighbor +// recv_displs.data(), // Receive displacements +// MPI_DOUBLE, // Receive type +// graph_comm // Graph communicator (NOT MPI_COMM_WORLD!) +// ); - // 4. Unpack receive buffer into ghost items - for (size_t i = 0; i < recv_ghost_lids.size(); i++) { - int ghost_local_id = recv_ghost_lids[i]; - for (int f = 0; f < num_fields; f++) { - data_ptr[ghost_local_id * num_fields + f] = ghost_recv_buffer[i * num_fields + f]; - } - } - } +// // 4. Unpack receive buffer into ghost items +// for (size_t i = 0; i < recv_ghost_lids.size(); i++) { +// int ghost_local_id = recv_ghost_lids[i]; +// for (int f = 0; f < num_fields; f++) { +// data_ptr[ghost_local_id * num_fields + f] = ghost_recv_buffer[i * num_fields + f]; +// } +// } +// } - /** - * @brief Initialize persistent neighborhood collective (MPI-4.0+) - */ - inline void init_persistent_neighborhood(int num_fields) { +// /** +// * @brief Initialize persistent neighborhood collective (MPI-4.0+) +// */ +// inline void init_persistent_neighborhood(int num_fields) { - if (!has_graph_comm) { - std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; - return; - } +// if (!has_graph_comm) { +// std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; +// return; +// } - if (has_persistent_neighbor) { - std::cerr << "Warning: Persistent neighborhood already initialized, freeing and re-creating." 
<< std::endl; - free_persistent_neighborhood(); - } +// if (has_persistent_neighbor) { +// std::cerr << "Warning: Persistent neighborhood already initialized, freeing and re-creating." << std::endl; +// free_persistent_neighborhood(); +// } - persistent_num_fields = num_fields; +// persistent_num_fields = num_fields; - // Allocate buffers - int total_send = send_ghost_lids.size(); - int total_recv = recv_ghost_lids.size(); - ghost_send_buffer.resize(total_send * num_fields); - ghost_recv_buffer.resize(total_recv * num_fields); +// // Allocate buffers +// int total_send = send_ghost_lids.size(); +// int total_recv = recv_ghost_lids.size(); +// ghost_send_buffer.resize(total_send * num_fields); +// ghost_recv_buffer.resize(total_recv * num_fields); - // Setup counts and displacements for persistent request - for (int i = 0; i < num_send_ranks; i++) { - int start_idx = send_ghost_offsets[i]; - int end_idx = send_ghost_offsets[i + 1]; - send_counts[i] = (end_idx - start_idx) * num_fields; - send_displs[i] = start_idx * num_fields; - } +// // Setup counts and displacements for persistent request +// for (int i = 0; i < num_send_ranks; i++) { +// int start_idx = send_ghost_offsets[i]; +// int end_idx = send_ghost_offsets[i + 1]; +// send_counts[i] = (end_idx - start_idx) * num_fields; +// send_displs[i] = start_idx * num_fields; +// } - for (int i = 0; i < num_recv_ranks; i++) { - int start_idx = recv_ghost_offsets[i]; - int end_idx = recv_ghost_offsets[i + 1]; - recv_counts[i] = (end_idx - start_idx) * num_fields; - recv_displs[i] = start_idx * num_fields; - } +// for (int i = 0; i < num_recv_ranks; i++) { +// int start_idx = recv_ghost_offsets[i]; +// int end_idx = recv_ghost_offsets[i + 1]; +// recv_counts[i] = (end_idx - start_idx) * num_fields; +// recv_displs[i] = start_idx * num_fields; +// } -#if MPI_VERSION >= 4 - // MPI-4.0+ persistent neighborhood collective - // MPI_Neighbor_alltoallv_init signature (similar to MPI_Neighbor_alltoallv but creates request): - 
// (sendbuf, sendcounts[], sdispls[], sendtype, - // recvbuf, recvcounts[], rdispls[], recvtype, comm, info, request) - MPI_Neighbor_alltoallv_init( - ghost_send_buffer.data(), send_counts.data(), send_displs.data(), MPI_DOUBLE, - ghost_recv_buffer.data(), recv_counts.data(), recv_displs.data(), MPI_DOUBLE, - graph_comm, - MPI_INFO_NULL, - &persistent_neighbor_request - ); - has_persistent_neighbor = true; -#else - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (rank == 0) { - std::cerr << "Warning: MPI-4.0 required for persistent neighborhood collectives" << std::endl; - std::cerr << " Detected MPI version: " << MPI_VERSION << "." << MPI_SUBVERSION << std::endl; - std::cerr << " Will fall back to standard neighborhood collective" << std::endl; - } - has_persistent_neighbor = false; -#endif - } +// #if MPI_VERSION >= 4 +// // MPI-4.0+ persistent neighborhood collective +// // MPI_Neighbor_alltoallv_init signature (similar to MPI_Neighbor_alltoallv but creates request): +// // (sendbuf, sendcounts[], sdispls[], sendtype, +// // recvbuf, recvcounts[], rdispls[], recvtype, comm, info, request) +// MPI_Neighbor_alltoallv_init( +// ghost_send_buffer.data(), send_counts.data(), send_displs.data(), MPI_DOUBLE, +// ghost_recv_buffer.data(), recv_counts.data(), recv_displs.data(), MPI_DOUBLE, +// graph_comm, +// MPI_INFO_NULL, +// &persistent_neighbor_request +// ); +// has_persistent_neighbor = true; +// #else +// int rank; +// MPI_Comm_rank(MPI_COMM_WORLD, &rank); +// if (rank == 0) { +// std::cerr << "Warning: MPI-4.0 required for persistent neighborhood collectives" << std::endl; +// std::cerr << " Detected MPI version: " << MPI_VERSION << "." 
<< MPI_SUBVERSION << std::endl; +// std::cerr << " Will fall back to standard neighborhood collective" << std::endl; +// } +// has_persistent_neighbor = false; +// #endif +// } - /** - * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) - */ - inline void exchange_ghosts_persistent(double* data_ptr) { +// /** +// * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) +// */ +// inline void exchange_ghosts_persistent(double* data_ptr) { -#if MPI_VERSION >= 4 - if (!has_persistent_neighbor) { - std::cerr << "Error: Must call init_persistent_neighborhood() first!" << std::endl; - std::cerr << " Falling back to standard neighborhood collective..." << std::endl; - exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); - return; - } +// #if MPI_VERSION >= 4 +// if (!has_persistent_neighbor) { +// std::cerr << "Error: Must call init_persistent_neighborhood() first!" << std::endl; +// std::cerr << " Falling back to standard neighborhood collective..." << std::endl; +// exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); +// return; +// } - // 1. Pack send buffer (same memory location as during init) - for (size_t i = 0; i < send_ghost_lids.size(); i++) { - int local_id = send_ghost_lids[i]; - for (int f = 0; f < persistent_num_fields; f++) { - ghost_send_buffer[i * persistent_num_fields + f] = - data_ptr[local_id * persistent_num_fields + f]; - } - } +// // 1. Pack send buffer (same memory location as during init) +// for (size_t i = 0; i < send_ghost_lids.size(); i++) { +// int local_id = send_ghost_lids[i]; +// for (int f = 0; f < persistent_num_fields; f++) { +// ghost_send_buffer[i * persistent_num_fields + f] = +// data_ptr[local_id * persistent_num_fields + f]; +// } +// } - // 2. Start persistent request (VERY fast - no setup overhead) - MPI_Start(&persistent_neighbor_request); +// // 2. Start persistent request (VERY fast - no setup overhead) +// MPI_Start(&persistent_neighbor_request); - // 3. 
Wait for completion - MPI_Wait(&persistent_neighbor_request, MPI_STATUS_IGNORE); +// // 3. Wait for completion +// MPI_Wait(&persistent_neighbor_request, MPI_STATUS_IGNORE); - // 4. Unpack receive buffer - for (size_t i = 0; i < recv_ghost_lids.size(); i++) { - int ghost_id = recv_ghost_lids[i]; - for (int f = 0; f < persistent_num_fields; f++) { - data_ptr[ghost_id * persistent_num_fields + f] = - ghost_recv_buffer[i * persistent_num_fields + f]; - } - } -#else - // Fallback to standard method if MPI-4 not available - exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); -#endif - } +// // 4. Unpack receive buffer +// for (size_t i = 0; i < recv_ghost_lids.size(); i++) { +// int ghost_id = recv_ghost_lids[i]; +// for (int f = 0; f < persistent_num_fields; f++) { +// data_ptr[ghost_id * persistent_num_fields + f] = +// ghost_recv_buffer[i * persistent_num_fields + f]; +// } +// } +// #else +// // Fallback to standard method if MPI-4 not available +// exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); +// #endif +// } - /** - * @brief Free persistent neighborhood collective resources - */ - inline void free_persistent_neighborhood() { -#if MPI_VERSION >= 4 - if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { - MPI_Request_free(&persistent_neighbor_request); - persistent_neighbor_request = MPI_REQUEST_NULL; - has_persistent_neighbor = false; - } -#endif - } +// /** +// * @brief Free persistent neighborhood collective resources +// */ +// inline void free_persistent_neighborhood() { +// #if MPI_VERSION >= 4 +// if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { +// MPI_Request_free(&persistent_neighbor_request); +// persistent_neighbor_request = MPI_REQUEST_NULL; +// has_persistent_neighbor = false; +// } +// #endif +// } -}; +// }; diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 9c4267bf..752b39e6 100644 --- 
a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -16,8 +16,6 @@ #include "mesh_io.h" -#include "communication_plan.h" - // Include Scotch headers #include "scotch.h" #include "ptscotch.h" @@ -32,6 +30,7 @@ void partition_mesh( Mesh_t& final_mesh, node_t& initial_node, node_t& final_node, + GaussPoint_t& gauss_point, int world_size, int rank){ @@ -1675,6 +1674,56 @@ void partition_mesh( extended_lid_to_elem_gid[intermediate_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; } + // Build array: for each ghost element, store which rank owns it (where to receive data from) + std::vector ghost_elem_owner_ranks(ghost_elem_gids_ordered.size()); + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { + size_t ghost_gid = ghost_elem_gids_ordered[i]; + auto it = elem_gid_to_rank.find(ghost_gid); + if (it != elem_gid_to_rank.end()) { + ghost_elem_owner_ranks[i] = it->second; + } else { + std::cerr << "[rank " << rank << "] ERROR: Ghost element GID " << ghost_gid + << " not found in elem_gid_to_rank map!" 
<< std::endl; + ghost_elem_owner_ranks[i] = -1; // Invalid rank as error indicator + } + } + + // Optional: Print ghost element receive pattern + if (print_info) { + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Ghost element receive pattern:" << std::endl; + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { + size_t ghost_ext_lid = intermediate_mesh.num_elems + i; + std::cout << " Ghost elem ext_lid=" << ghost_ext_lid + << " gid=" << ghost_elem_gids_ordered[i] + << " receives from rank " << ghost_elem_owner_ranks[i] << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + } + } + + // Create a std::set of all the ranks this rank will receive data from + std::set ghost_elem_receive_ranks; + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { + ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); + } + + + // Print with ranks this rank will receive element data from sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Ranks this rank will receive element data from: "; + for (int rank : ghost_elem_receive_ranks) { + std::cout << rank << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } // ****************************************************************************************** @@ -1792,6 +1841,7 @@ void partition_mesh( all_owned_gids.data(), owned_counts.data(), owned_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + // d) Global coords (size: total_owned x 3) std::vector owned_coords_send(3*local_owned_count, 0.0); for (int i=0; i [ranks that ghost it]. - // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
- // -------------------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------- +// Build the send patterns for elements +// Build reverse map via global IDs: for each local element gid, find ranks that ghost it. +// Steps: +// 1) Each rank contributes its ghost element GIDs. +// 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. +// 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. +// -------------------------------------------------------------------------------------- std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); // Prepare local ghost list as vector @@ -1909,8 +1960,6 @@ void partition_mesh( std::cout.flush(); MPI_Barrier(MPI_COMM_WORLD); - - // Optional: print a compact summary of reverse map for verification (limited output) for(int i = 0; i < world_size; i++) { @@ -1982,7 +2031,7 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); if (rank == r) { std::cout << std::endl; - std::cout << "[rank " << rank << "] communicates to ranks: "; + std::cout << "[rank " << rank << "] elements communicates to ranks: "; for (int i = 0; i < num_ghost_comm_ranks; ++i) { std::cout << ghost_comm_ranks_vec[i] << " "; } @@ -1991,6 +2040,8 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); } + print_info = false; + // Print out the boundary element local ids on each rank sequentially for (int r = 0; r < world_size; ++r) { MPI_Barrier(MPI_COMM_WORLD); @@ -2009,31 +2060,415 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); } + + final_mesh.num_boundary_elems = boundary_elem_local_ids.size(); + final_mesh.boundary_elem_local_ids = DCArrayKokkos(final_mesh.num_boundary_elems); + for (int i = 0; i < final_mesh.num_boundary_elems; i++) { + final_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; + } + final_mesh.boundary_elem_local_ids.update_device(); + print_info = false; 
MPI_Barrier(MPI_COMM_WORLD); +// ****************************************************************************************** +// Create MPI distributed graph communicator for element communication +// ****************************************************************************************** + // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator + // that efficiently represents the communication pattern between ranks. + // This allows MPI to optimize communication based on the actual connectivity pattern. + + // ---------- Prepare input communicator ---------- + // comm_old: The base communicator from which to create the graph communicator + MPI_Comm comm_old = MPI_COMM_WORLD; + + // ---------- Prepare INCOMING edges (sources) ---------- + // indegree: Number of ranks from which this rank will RECEIVE data + // These are the ranks that own elements which are ghosted on this rank + std::vector ghost_elem_receive_ranks_vec(ghost_elem_receive_ranks.begin(), + ghost_elem_receive_ranks.end()); + // The number of ranks from which this rank will receive data (incoming neighbors) + int indegree = static_cast(ghost_elem_receive_ranks_vec.size()); + + // sources: Array of source rank IDs (ranks we receive from) + // Each element corresponds to a rank that owns elements we ghost + int* sources = (indegree > 0) ? 
ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; + + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* sourceweights = MPI_UNWEIGHTED; + + // ---------- Prepare OUTGOING edges (destinations) ---------- + // outdegree: Number of ranks to which this rank will SEND data + // These are the ranks that ghost elements owned by this rank + int outdegree = num_ghost_comm_ranks; + + // destinations: Array of destination rank IDs (ranks we send to) + // Each element corresponds to a rank that ghosts our owned elements + int* destinations = (outdegree > 0) ? ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; + + // destweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* destweights = MPI_UNWEIGHTED; + + // ---------- Additional parameters ---------- + // info: Hints for optimization (MPI_INFO_NULL means use defaults) + MPI_Info info = MPI_INFO_NULL; + + // reorder: Whether to allow MPI to reorder ranks for optimization (0=no reordering) + // Setting to 0 preserves original rank numbering + int reorder = 0; + + // ---------- Output communicator ---------- + // graph_comm: The new distributed graph communicator that will be created + MPI_Comm graph_comm; + + // Create the distributed graph communicator + // This call collectively creates a communicator where each rank specifies: + // - Which ranks it receives from (sources/indegree) + // - Which ranks it sends to (destinations/outdegree) + // MPI can then optimize collective operations and point-to-point communication + // based on this connectivity information. 
+ MPI_Dist_graph_create_adjacent( + comm_old, // Input: base communicator + indegree, // Input: number of incoming neighbors (ranks we receive from) + sources, // Input: array of source ranks [indegree elements] + sourceweights, // Input: weights on incoming edges (MPI_UNWEIGHTED) + outdegree, // Input: number of outgoing neighbors (ranks we send to) + destinations, // Input: array of destination ranks [outdegree elements] + destweights, // Input: weights on outgoing edges (MPI_UNWEIGHTED) + info, // Input: optimization hints (MPI_INFO_NULL) + reorder, // Input: allow rank reordering (0=no) + &graph_comm // Output: new distributed graph communicator + ); + + // Optional: Verify the graph communicator was created successfully + if (rank == 0) { + std::cout << " Created MPI distributed graph communicator for element communication" << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + // ============================================================================ + // Verify the distributed graph communicator + // ============================================================================ + // Query the graph to verify it matches what we specified + int indegree_out, outdegree_out, weighted; + MPI_Dist_graph_neighbors_count(graph_comm, &indegree_out, &outdegree_out, &weighted); + + // Allocate arrays to receive neighbor information + std::vector sources_out(indegree_out); + std::vector sourceweights_out(indegree_out); + std::vector destinations_out(outdegree_out); + std::vector destweights_out(outdegree_out); + + // Retrieve the actual neighbors from the graph communicator + MPI_Dist_graph_neighbors(graph_comm, + indegree_out, sources_out.data(), sourceweights_out.data(), + outdegree_out, destinations_out.data(), destweights_out.data()); + + // Print verification information for each rank sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << 
std::endl; + std::cout << " Indegree (receives from " << indegree_out << " ranks): "; + for (int i = 0; i < indegree_out; ++i) { + std::cout << sources_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; + for (int i = 0; i < outdegree_out; ++i) { + std::cout << destinations_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + // Additional verification: Check if the queried values match our input + bool verification_passed = true; + if (indegree_out != indegree) { + std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " + << "Expected " << indegree << ", got " << indegree_out << std::endl; + verification_passed = false; + } + if (outdegree_out != outdegree) { + std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " + << "Expected " << outdegree << ", got " << outdegree_out << std::endl; + verification_passed = false; + } + + // Check if source and destination ranks match (order may differ) + std::set sources_set_in(ghost_elem_receive_ranks_vec.begin(), ghost_elem_receive_ranks_vec.end()); + std::set sources_set_out(sources_out.begin(), sources_out.end()); + if (sources_set_in != sources_set_out) { + std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" << std::endl; + verification_passed = false; + } + + std::set dests_set_in(ghost_comm_ranks_vec.begin(), ghost_comm_ranks_vec.end()); + std::set dests_set_out(destinations_out.begin(), destinations_out.end()); + if (dests_set_in != dests_set_out) { + std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; + verification_passed = false; + } + + // Global verification check + int local_passed = verification_passed ? 
1 : 0; + int global_passed = 0; + MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) { + if (global_passed) { + std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); - // Build communication plans for elements - CommunicationPlan element_comm_plan; - element_comm_plan.initialize(num_send_ranks, num_recv_ranks); +// ****************************************************************************************** +// Test element communication using MPI_Neighbor_alltoallv +// ****************************************************************************************** + // Gauss points share the same communication plan as elements. + // This test initializes gauss point fields on owned elements and exchanges them with ghost elements. + + print_info = true; // Enable debug output for communication test + + gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}); + + // Initialize the gauss point fields on each rank + // Set owned elements to rank number, ghost elements to -1 (to verify communication) + for (int i = 0; i < final_mesh.num_owned_elems; i++) { + gauss_point.fields.host(i) = static_cast(rank); + } + for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { + gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated + } + gauss_point.fields.update_device(); + + // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== + // For MPI_Neighbor_alltoallv with graph communicator: + // - elem_sendcounts[i] = number of elements to send to i-th outgoing neighbor (destinations_out[i]) + // - elem_sdispls[i] = starting position in send buffer for i-th outgoing neighbor + + std::vector elem_sendcounts(outdegree_out, 0); + std::vector 
elem_sdispls(outdegree_out, 0); + + // Count how many boundary elements go to each destination rank + // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element + std::map> elems_to_send_by_rank; // rank -> list of boundary element local IDs + + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + if (!boundary_elem_targets[elem_lid].empty()) { + for (const auto &pr : boundary_elem_targets[elem_lid]) { + int dest_rank = pr.first; + elems_to_send_by_rank[dest_rank].push_back(elem_lid); + } + } + } + + // Fill elem_sendcounts based on the graph communicator's destination order + int total_send = 0; + for (int i = 0; i < outdegree_out; i++) { + int dest_rank = destinations_out[i]; + elem_sendcounts[i] = static_cast(elems_to_send_by_rank[dest_rank].size()); + elem_sdispls[i] = total_send; + total_send += elem_sendcounts[i]; + } + + // Debug: Print send counts + if (print_info) { + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Send counts: "; + for (int i = 0; i < outdegree_out; i++) { + std::cout << "to_rank_" << destinations_out[i] << "=" << elem_sendcounts[i] << " "; + } + std::cout << "(total=" << total_send << ")" << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + } + + // ========== Build receive counts and displacements for INCOMING neighbors (sources) ========== + // - elem_recvcounts[i] = number of elements to receive from i-th incoming neighbor (sources_out[i]) + // - elem_rdispls[i] = starting position in recv buffer for i-th incoming neighbor + + std::vector elem_recvcounts(indegree_out, 0); + std::vector elem_rdispls(indegree_out, 0); + + // Count how many ghost elements come from each source rank + // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element + std::map> elems_to_recv_by_rank; // rank -> list of ghost element indices + + for (size_t i = 0; i < ghost_elem_owner_ranks.size(); 
i++) { + int source_rank = ghost_elem_owner_ranks[i]; + elems_to_recv_by_rank[source_rank].push_back(static_cast(i)); + } + + // Fill elem_recvcounts based on the graph communicator's source order + int total_recv = 0; + for (int i = 0; i < indegree_out; i++) { + int source_rank = sources_out[i]; + elem_recvcounts[i] = static_cast(elems_to_recv_by_rank[source_rank].size()); + elem_rdispls[i] = total_recv; + total_recv += elem_recvcounts[i]; + } + + // Debug: Print receive counts + if (print_info) { + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Recv counts: "; + for (int i = 0; i < indegree_out; i++) { + std::cout << "from_rank_" << sources_out[i] << "=" << elem_recvcounts[i] << " "; + } + std::cout << "(total=" << total_recv << ", expected_ghosts=" << final_mesh.num_ghost_elems << ")" << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + } + + // ========== Build send buffer organized by destination rank ========== + std::vector elem_send_buffer(total_send); + int send_idx = 0; + + for (int i = 0; i < outdegree_out; i++) { + int dest_rank = destinations_out[i]; + const auto& elems_for_this_rank = elems_to_send_by_rank[dest_rank]; + + for (int elem_lid : elems_for_this_rank) { + elem_send_buffer[send_idx++] = gauss_point.fields.host(elem_lid); + } + } + + // ========== Allocate receive buffer ========== + std::vector elem_recv_buffer(total_recv); + + // ========== Exchange data using MPI_Neighbor_alltoallv ========== + // MPI_Neighbor_alltoallv exchanges data with neighbors in the graph communicator topology + // - elem_sendcounts[i]: number of doubles to send to i-th outgoing neighbor + // - elem_recvcounts[i]: number of doubles to receive from i-th incoming neighbor + // - The order of neighbors must match the order returned by MPI_Dist_graph_neighbors + + MPI_Neighbor_alltoallv( + elem_send_buffer.data(), // Send buffer with boundary element data + elem_sendcounts.data(), // 
Number of elements to send to each outgoing neighbor [outdegree] + elem_sdispls.data(), // Displacement in send buffer for each outgoing neighbor [outdegree] + MPI_DOUBLE, // Send data type + elem_recv_buffer.data(), // Receive buffer for ghost element data + elem_recvcounts.data(), // Number of elements to receive from each incoming neighbor [indegree] + elem_rdispls.data(), // Displacement in recv buffer for each incoming neighbor [indegree] + MPI_DOUBLE, // Receive data type + graph_comm // Distributed graph communicator + ); + + // ========== Update ghost element fields from receive buffer ========== + // Unpack received data back into ghost elements in the correct order + + // Track which ghost elements have been updated for debugging + std::vector ghost_updated(final_mesh.num_ghost_elems, false); + + int recv_idx = 0; + for (int i = 0; i < indegree_out; i++) { + int source_rank = sources_out[i]; + const auto& ghost_indices = elems_to_recv_by_rank[source_rank]; + + for (int ghost_idx : ghost_indices) { + int ghost_elem_local_id = final_mesh.num_owned_elems + ghost_idx; + gauss_point.fields.host(ghost_elem_local_id) = elem_recv_buffer[recv_idx++]; + ghost_updated[ghost_idx] = true; + } + } + + // Debug: Check which ghosts weren't updated + if (print_info) { + std::vector missing_ghosts; + for (size_t i = 0; i < ghost_updated.size(); i++) { + if (!ghost_updated[i]) { + missing_ghosts.push_back(static_cast(i)); + } + } + + if (!missing_ghosts.empty()) { + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] WARNING: " << missing_ghosts.size() + << " ghost elements not in elems_to_recv_by_rank: "; + for (size_t i = 0; i < std::min(missing_ghosts.size(), size_t(10)); i++) { + std::cout << missing_ghosts[i] << " "; + } + if (missing_ghosts.size() > 10) std::cout << "..."; + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + } + } + + gauss_point.fields.update_device(); + + // 
========== Verify the communication worked correctly ========== + bool comm_test_passed = true; + for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { + if (gauss_point.fields.host(i) < 0.0) { + std::cerr << "[rank " << rank << "] ERROR: Ghost element " << i + << " was not updated (value = " << gauss_point.fields.host(i) << ")" << std::endl; + comm_test_passed = false; + } + } + + int local_test_passed = comm_test_passed ? 1 : 0; + int global_test_passed = 0; + MPI_Allreduce(&local_test_passed, &global_test_passed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) { + if (global_test_passed) { + std::cout << "\n✓ Element communication test PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Element communication test FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + print_info = false; // Disable debug output after communication test + // Loop over all elements and average the values of elements connected to that element + for (int i = 0; i < final_mesh.num_elems; i++) { + double value = 0.0; + for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { + value += gauss_point.fields.host(final_mesh.elems_in_elem(i, j)); + } + value /= final_mesh.num_elems_in_elem(i); + gauss_point.fields.host(i) = value; + } + gauss_point.fields.update_device(); + - // -------------------------------------------------------------------------------------- - // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. - // Steps: - // 1) Each rank contributes its ghost node GIDs. - // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. - // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
- // -------------------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------- +// Build the send pattern for nodes +// Build reverse map via global IDs: for each local node gid, find ranks that ghost it. +// Steps: +// 1) Each rank contributes its ghost node GIDs. +// 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. +// 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. +// -------------------------------------------------------------------------------------- std::vector>> boundary_node_targets(intermediate_mesh.num_nodes); @@ -2129,15 +2564,7 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; - - - - // Build communication plans for elements and nodes - CommunicationPlan element_comm_plan; - CommunicationPlan node_comm_plan; - - element_comm_plan.build(intermediate_mesh, world_size, rank, boundary_elem_targets, boundary_elem_local_ids, boundary_to_ghost_ranks); - node_comm_plan.build(intermediate_mesh, world_size, rank, boundary_node_targets, boundary_node_local_ids, boundary_to_ghost_ranks); + } diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index a745e17e..01ad00c6 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -310,11 +310,17 @@ struct Mesh_t DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping + // Element communicaiton data definitions size_t num_owned_elems; ///< Number of owned elements on this rank - size_t num_ghost_elems; ///< Number of ghost elements on this rank (from neighboring MPI ranks) + size_t num_boundary_elems; ///< Number of boundary elements on this rank (send data to neighboring MPI ranks) + DCArrayKokkos boundary_elem_local_ids; ///< 
Local IDs of boundary elements on this rank (send data to neighboring MPI ranks) + size_t num_ghost_elems; ///< Number of ghost elements on this rank (receive data from neighboring MPI ranks) + // Node communication data definitions size_t num_owned_nodes; ///< Number of owned nodes on this rank - size_t num_ghost_nodes; ///< Number of ghost nodes on this rank (from neighboring MPI ranks) + size_t num_boundary_nodes; ///< Number of boundary nodes on this rank (send data to neighboring MPI ranks) + DCArrayKokkos boundary_node_local_ids; ///< Local IDs of boundary nodes on this rank (send data to neighboring MPI ranks) + size_t num_ghost_nodes; ///< Number of ghost nodes on this rank (receive data from neighboring MPI ranks) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 608c3867..b14ee9cd 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -44,6 +44,8 @@ int main(int argc, char** argv) { Mesh_t final_mesh; node_t final_node; + GaussPoint_t gauss_point; + // ******************************************************** // Build the initial mesh // ******************************************************** @@ -60,11 +62,11 @@ int main(int argc, char** argv) { // ******************************************************** // Partition and balance the mesh // ******************************************************** - partition_mesh(initial_mesh, final_mesh, initial_node, final_node, world_size, rank); + partition_mesh(initial_mesh, final_mesh, initial_node, final_node, gauss_point, world_size, rank); // write_vtk(intermediate_mesh, intermediate_node, rank); MPI_Barrier(MPI_COMM_WORLD); - write_vtu(final_mesh, final_node, rank, MPI_COMM_WORLD); + write_vtu(final_mesh, final_node, gauss_point, rank, MPI_COMM_WORLD); // write_vtk(final_mesh, final_node, rank); MPI_Barrier(MPI_COMM_WORLD); diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index f0801777..77dac8d0
100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -497,6 +497,7 @@ void build_3d_box( ///////////////////////////////////////////////////////////////////////////// void write_vtu(Mesh_t& mesh, node_t& node, + GaussPoint_t& gauss_point, int rank, MPI_Comm comm) { @@ -511,7 +512,7 @@ void write_vtu(Mesh_t& mesh, node.coords.update_host(); Kokkos::fence(); - const int num_cell_scalar_vars = 3; + const int num_cell_scalar_vars = 4; const int num_cell_vec_vars = 0; const int num_cell_tensor_vars = 0; @@ -520,7 +521,7 @@ void write_vtu(Mesh_t& mesh, // Scalar values associated with a cell const char cell_scalar_var_names[num_cell_scalar_vars][30] = { - "rank_id", "elems_in_elem_owned", "global_elem_id" + "rank_id", "elems_in_elem_owned", "global_elem_id", "field_value" }; const char point_scalar_var_names[num_point_scalar_vars][15] = { @@ -543,6 +544,7 @@ void write_vtu(Mesh_t& mesh, elem_fields(elem_gid, 0) = rank; elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); + elem_fields(elem_gid, 3) = gauss_point.fields.host(elem_gid); } // save the vertex vector fields to an array for exporting to graphics files diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 5ba78be9..35b73985 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -1,360 +1,360 @@ -#ifndef MPIDARRAYKOKKOS_H -#define MPIDARRAYKOKKOS_H - -#include "matar.h" -#include "communication_plan.h" - -using namespace mtr; - -///////////////////////// -// MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. -// -// Enhanced with automatic ghost synchronization via CommunicationPlan. -// Allocates space for owned + ghost items and provides communicate() method. 
-// -// Usage: -// node.coords.communicate() -> syncs ghost nodes automatically -// elem.density.communicate() -> syncs ghost elements automatically -///////////////////////// -template -class MPIDArrayKokkos { - - // this is manage - using TArray1D = Kokkos::DualView ; +// #ifndef MPIDARRAYKOKKOS_H +// #define MPIDARRAYKOKKOS_H + +// #include "matar.h" +// #include "communication_plan.h" + +// using namespace mtr; + +// ///////////////////////// +// // MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// // +// // Enhanced with automatic ghost synchronization via CommunicationPlan. +// // Allocates space for owned + ghost items and provides communicate() method. +// // +// // Usage: +// // node.coords.communicate() -> syncs ghost nodes automatically +// // elem.density.communicate() -> syncs ghost elements automatically +// ///////////////////////// +// template +// class MPIDArrayKokkos { + +// // this is manage +// using TArray1D = Kokkos::DualView ; -protected: - size_t dims_[7]; - size_t length_; - size_t order_; // tensor order (rank) - int mpi_recv_rank_; - int mpi_tag_; - MPI_Comm mpi_comm_; - MPI_Status mpi_status_; - MPI_Datatype mpi_datatype_; - MPI_Request mpi_request_; - TArray1D this_array_; +// protected: +// size_t dims_[7]; +// size_t length_; +// size_t order_; // tensor order (rank) +// int mpi_recv_rank_; +// int mpi_tag_; +// MPI_Comm mpi_comm_; +// MPI_Status mpi_status_; +// MPI_Datatype mpi_datatype_; +// MPI_Request mpi_request_; +// TArray1D this_array_; - // --- Ghost Communication Support --- - CommunicationPlan* comm_plan_; // Pointer to shared communication plan - size_t num_owned_items_; // Number of owned items (nodes/elements) - size_t num_total_items_; // Total items including ghosts (owned + ghost) - size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) +// // --- Ghost Communication Support --- +// CommunicationPlan* comm_plan_; // Pointer to shared communication plan +// size_t 
num_owned_items_; // Number of owned items (nodes/elements) +// size_t num_total_items_; // Total items including ghosts (owned + ghost) +// size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) - void set_mpi_type(); +// void set_mpi_type(); -public: - // Data member to access host view - ViewCArray host; +// public: +// // Data member to access host view +// ViewCArray host; - MPIDArrayKokkos(); +// MPIDArrayKokkos(); - MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, +// size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, +// size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, +// size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, - size_t dim6, const std::string& 
tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, +// size_t dim3, size_t dim4, size_t dim5, +// size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); - // ======================================================================== - // DISTRIBUTED COMMUNICATION METHODS (NEW) - // ======================================================================== +// // ======================================================================== +// // DISTRIBUTED COMMUNICATION METHODS (NEW) +// // ======================================================================== - /** - * @brief Set communication plan and ghost metadata - * - * Call this ONCE after allocating the array to enable ghost communication. - * Multiple fields can share the same CommunicationPlan pointer. - * - * @param plan Pointer to shared CommunicationPlan (node or element plan) - * @param num_owned Number of owned items on this rank - * @param num_total Total items including ghosts (owned + ghost) - * - * Example: - * node.coords = MPIDArrayKokkos(num_total_nodes, 3); - * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); - */ - void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); +// /** +// * @brief Set communication plan and ghost metadata +// * +// * Call this ONCE after allocating the array to enable ghost communication. +// * Multiple fields can share the same CommunicationPlan pointer. 
+// * +// * @param plan Pointer to shared CommunicationPlan (node or element plan) +// * @param num_owned Number of owned items on this rank +// * @param num_total Total items including ghosts (owned + ghost) +// * +// * Example: +// * node.coords = MPIDArrayKokkos(num_total_nodes, 3); +// * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); +// */ +// void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); - /** - * @brief Synchronize ghost data using neighborhood collectives - * - * Automatically exchanges boundary → ghost data for this field. - * Uses the CommunicationPlan provided via set_communication_plan(). - * - * Workflow: - * 1. Updates host data from device (if needed) - * 2. Packs owned boundary items - * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) - * 4. Unpacks into ghost items - * 5. Updates device with new ghost data - * - * Example usage: - * // Update owned nodes - * for (int i = 0; i < num_owned_nodes; i++) { - * node.coords(i, 0) += dt * velocity(i, 0); - * } - * - * // Sync ghosts - * node.coords.communicate(); - * - * // Now ghost data is current - */ - void communicate(); +// /** +// * @brief Synchronize ghost data using neighborhood collectives +// * +// * Automatically exchanges boundary → ghost data for this field. +// * Uses the CommunicationPlan provided via set_communication_plan(). +// * +// * Workflow: +// * 1. Updates host data from device (if needed) +// * 2. Packs owned boundary items +// * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) +// * 4. Unpacks into ghost items +// * 5. 
Updates device with new ghost data +// * +// * Example usage: +// * // Update owned nodes +// * for (int i = 0; i < num_owned_nodes; i++) { +// * node.coords(i, 0) += dt * velocity(i, 0); +// * } +// * +// * // Sync ghosts +// * node.coords.communicate(); +// * +// * // Now ghost data is current +// */ +// void communicate(); - /** - * @brief Non-blocking version: start ghost exchange - * - * For advanced users who want to overlap computation with communication. - * Must call communicate_wait() before accessing ghost data. - */ - void communicate_begin(); +// /** +// * @brief Non-blocking version: start ghost exchange +// * +// * For advanced users who want to overlap computation with communication. +// * Must call communicate_wait() before accessing ghost data. +// */ +// void communicate_begin(); - /** - * @brief Wait for non-blocking ghost exchange to complete - */ - void communicate_wait(); +// /** +// * @brief Wait for non-blocking ghost exchange to complete +// */ +// void communicate_wait(); - /** - * @brief Get number of owned items (excludes ghosts) - */ - KOKKOS_INLINE_FUNCTION - size_t num_owned() const { return num_owned_items_; } +// /** +// * @brief Get number of owned items (excludes ghosts) +// */ +// KOKKOS_INLINE_FUNCTION +// size_t num_owned() const { return num_owned_items_; } - /** - * @brief Get total items including ghosts - */ - KOKKOS_INLINE_FUNCTION - size_t num_total() const { return num_total_items_; } +// /** +// * @brief Get total items including ghosts +// */ +// KOKKOS_INLINE_FUNCTION +// size_t num_total() const { return num_total_items_; } - /** - * @brief Check if ghost communication is configured - */ - bool has_communication_plan() const { return comm_plan_ != nullptr; } +// /** +// * @brief Check if ghost communication is configured +// */ +// bool has_communication_plan() const { return comm_plan_ != nullptr; } - // These functions can setup the data needed for halo send/receives - // Not necessary for standard MPI comms - 
void mpi_setup(); +// // These functions can setup the data needed for halo send/receives +// // Not necessary for standard MPI comms +// void mpi_setup(); - void mpi_setup(int recv_rank); +// void mpi_setup(int recv_rank); - void mpi_setup(int recv_rank, int tag); +// void mpi_setup(int recv_rank, int tag); - void mpi_setup(int recv_rank, int tag, MPI_Comm comm); +// void mpi_setup(int recv_rank, int tag, MPI_Comm comm); - void mpi_set_rank(int recv_rank); +// void mpi_set_rank(int recv_rank); - void mpi_set_tag(int tag); +// void mpi_set_tag(int tag); - void mpi_set_comm(MPI_Comm comm); +// void mpi_set_comm(MPI_Comm comm); - int get_rank(); +// int get_rank(); - int get_tag(); +// int get_tag(); - MPI_Comm get_comm(); +// MPI_Comm get_comm(); - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j, size_t k) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j, size_t k, size_t l) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, - size_t n) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, +// size_t n) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, - size_t n, size_t o) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, 
+// size_t n, size_t o) const; - KOKKOS_INLINE_FUNCTION - MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); +// KOKKOS_INLINE_FUNCTION +// MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); - // GPU Method - // Method that returns size - KOKKOS_INLINE_FUNCTION - size_t size() const; +// // GPU Method +// // Method that returns size +// KOKKOS_INLINE_FUNCTION +// size_t size() const; - // Host Method - // Method that returns size - KOKKOS_INLINE_FUNCTION - size_t extent() const; +// // Host Method +// // Method that returns size +// KOKKOS_INLINE_FUNCTION +// size_t extent() const; - KOKKOS_INLINE_FUNCTION - size_t dims(size_t i) const; +// KOKKOS_INLINE_FUNCTION +// size_t dims(size_t i) const; - KOKKOS_INLINE_FUNCTION - size_t order() const; +// KOKKOS_INLINE_FUNCTION +// size_t order() const; - // Method returns the raw device pointer of the Kokkos DualView - KOKKOS_INLINE_FUNCTION - T* device_pointer() const; +// // Method returns the raw device pointer of the Kokkos DualView +// KOKKOS_INLINE_FUNCTION +// T* device_pointer() const; - // Method returns the raw host pointer of the Kokkos DualView - KOKKOS_INLINE_FUNCTION - T* host_pointer() const; +// // Method returns the raw host pointer of the Kokkos DualView +// KOKKOS_INLINE_FUNCTION +// T* host_pointer() const; - // Method returns kokkos dual view - KOKKOS_INLINE_FUNCTION - TArray1D get_kokkos_dual_view() const; +// // Method returns kokkos dual view +// KOKKOS_INLINE_FUNCTION +// TArray1D get_kokkos_dual_view() const; - // Method that update host view - void update_host(); +// // Method that update host view +// void update_host(); - // Method that update device view - void update_device(); +// // Method that update device view +// void update_device(); - // Deconstructor - virtual KOKKOS_INLINE_FUNCTION - ~MPIDArrayKokkos (); -}; // End of MPIDArrayKokkos - - -// ============================================================================ -// INLINE IMPLEMENTATIONS - DISTRIBUTED COMMUNICATION 
-// ============================================================================ - -/** - * @brief Default constructor - initialize ghost communication members - */ -template -KOKKOS_INLINE_FUNCTION -MPIDArrayKokkos::MPIDArrayKokkos() - : comm_plan_(nullptr), - num_owned_items_(0), - num_total_items_(0), - num_fields_(0) -{ - // Base constructor handles array initialization -} - - -/** - * @brief Set communication plan and ghost metadata - */ -template -inline void MPIDArrayKokkos::set_communication_plan( - CommunicationPlan* plan, - size_t num_owned, - size_t num_total) -{ - comm_plan_ = plan; - num_owned_items_ = num_owned; - num_total_items_ = num_total; +// // Deconstructor +// virtual KOKKOS_INLINE_FUNCTION +// ~MPIDArrayKokkos (); +// }; // End of MPIDArrayKokkos + + +// // ============================================================================ +// // INLINE IMPLEMENTATIONS - DISTRIBUTED COMMUNICATION +// // ============================================================================ + +// /** +// * @brief Default constructor - initialize ghost communication members +// */ +// template +// KOKKOS_INLINE_FUNCTION +// MPIDArrayKokkos::MPIDArrayKokkos() +// : comm_plan_(nullptr), +// num_owned_items_(0), +// num_total_items_(0), +// num_fields_(0) +// { +// // Base constructor handles array initialization +// } + + +// /** +// * @brief Set communication plan and ghost metadata +// */ +// template +// inline void MPIDArrayKokkos::set_communication_plan( +// CommunicationPlan* plan, +// size_t num_owned, +// size_t num_total) +// { +// comm_plan_ = plan; +// num_owned_items_ = num_owned; +// num_total_items_ = num_total; - // Infer number of fields from array dimensions - // Assumption: dim0 = num_items, dim1+ = fields - if (order_ == 1) { - num_fields_ = 1; // Scalar field - } else if (order_ == 2) { - num_fields_ = dims_[1]; // Vector field (e.g., coords[num_nodes, 3]) - } else { - // For higher order tensors, treat everything after dim0 as fields - 
num_fields_ = 1; - for (size_t i = 1; i < order_; i++) { - num_fields_ *= dims_[i]; - } - } +// // Infer number of fields from array dimensions +// // Assumption: dim0 = num_items, dim1+ = fields +// if (order_ == 1) { +// num_fields_ = 1; // Scalar field +// } else if (order_ == 2) { +// num_fields_ = dims_[1]; // Vector field (e.g., coords[num_nodes, 3]) +// } else { +// // For higher order tensors, treat everything after dim0 as fields +// num_fields_ = 1; +// for (size_t i = 1; i < order_; i++) { +// num_fields_ *= dims_[i]; +// } +// } - // Validate dimensions match total items - if (dims_[0] != num_total) { - std::cerr << "Error: Array dim0 (" << dims_[0] << ") does not match num_total (" - << num_total << ")" << std::endl; - std::cerr << " Array must be allocated with size = num_owned + num_ghost" << std::endl; - } -} - - -/** - * @brief Synchronize ghost data using neighborhood collectives - */ -template -inline void MPIDArrayKokkos::communicate() -{ - if (!comm_plan_) { - std::cerr << "Error: CommunicationPlan not set. Call set_communication_plan() first." << std::endl; - return; - } +// // Validate dimensions match total items +// if (dims_[0] != num_total) { +// std::cerr << "Error: Array dim0 (" << dims_[0] << ") does not match num_total (" +// << num_total << ")" << std::endl; +// std::cerr << " Array must be allocated with size = num_owned + num_ghost" << std::endl; +// } +// } + + +// /** +// * @brief Synchronize ghost data using neighborhood collectives +// */ +// template +// inline void MPIDArrayKokkos::communicate() +// { +// if (!comm_plan_) { +// std::cerr << "Error: CommunicationPlan not set. Call set_communication_plan() first." << std::endl; +// return; +// } - if (!comm_plan_->has_graph_comm) { - std::cerr << "Error: Graph communicator not initialized in CommunicationPlan." << std::endl; - std::cerr << " Call comm_plan.create_graph_communicator() first." 
<< std::endl; - return; - } +// if (!comm_plan_->has_graph_comm) { +// std::cerr << "Error: Graph communicator not initialized in CommunicationPlan." << std::endl; +// std::cerr << " Call comm_plan.create_graph_communicator() first." << std::endl; +// return; +// } - // 1. Update host from device (ensure data is current on CPU for MPI) - this->update_host(); +// // 1. Update host from device (ensure data is current on CPU for MPI) +// this->update_host(); - // 2. Get raw pointer to data - T* data_ptr = this->host_pointer(); +// // 2. Get raw pointer to data +// T* data_ptr = this->host_pointer(); - // 3. Convert to double* for MPI communication - // TODO: Support other types (int, float, etc.) with template specialization - static_assert(std::is_same::value, - "Currently only double supported for ghost communication"); +// // 3. Convert to double* for MPI communication +// // TODO: Support other types (int, float, etc.) with template specialization +// static_assert(std::is_same::value, +// "Currently only double supported for ghost communication"); - double* double_ptr = reinterpret_cast(data_ptr); +// double* double_ptr = reinterpret_cast(data_ptr); - // 4. Call neighborhood collective exchange - comm_plan_->exchange_ghosts_neighborhood(double_ptr, static_cast(num_fields_)); +// // 4. Call neighborhood collective exchange +// comm_plan_->exchange_ghosts_neighborhood(double_ptr, static_cast(num_fields_)); - // 5. 
Update device with new ghost data - this->update_device(); -} - - -/** - * @brief Non-blocking version: start ghost exchange - */ -template -inline void MPIDArrayKokkos::communicate_begin() -{ - // TODO: Implement non-blocking version using Isend/Irecv - // For now, just call blocking version - std::cerr << "Warning: communicate_begin() not yet implemented, using blocking communicate()" << std::endl; - communicate(); -} - - -/** - * @brief Wait for non-blocking ghost exchange to complete - */ -template -inline void MPIDArrayKokkos::communicate_wait() -{ - // TODO: Implement non-blocking version - // For now, this is a no-op since communicate_begin() is blocking -} - - -#endif // MPIDARRAYKOKKOS_H +// // 5. Update device with new ghost data +// this->update_device(); +// } + + +// /** +// * @brief Non-blocking version: start ghost exchange +// */ +// template +// inline void MPIDArrayKokkos::communicate_begin() +// { +// // TODO: Implement non-blocking version using Isend/Irecv +// // For now, just call blocking version +// std::cerr << "Warning: communicate_begin() not yet implemented, using blocking communicate()" << std::endl; +// communicate(); +// } + + +// /** +// * @brief Wait for non-blocking ghost exchange to complete +// */ +// template +// inline void MPIDArrayKokkos::communicate_wait() +// { +// // TODO: Implement non-blocking version +// // For now, this is a no-op since communicate_begin() is blocking +// } + + +// #endif // MPIDARRAYKOKKOS_H diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 8afb9abf..01f54624 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -82,7 +82,7 @@ struct node_t // Possible gauss point states, used to initialize GaussPoint_t enum class gauss_pt_state { - volume + fields }; ///////////////////////////////////////////////////////////////////////////// @@ -95,7 +95,7 @@ enum class gauss_pt_state struct GaussPoint_t { - DCArrayKokkos vol; ///< GaussPoint volume + 
DCArrayKokkos fields; ///< GaussPoint fields // initialization method (num_cells, num_dims) @@ -104,8 +104,8 @@ struct GaussPoint_t for (auto field : gauss_pt_states){ switch(field){ - case gauss_pt_state::volume: - if (vol.size() == 0) this->vol = DCArrayKokkos(num_gauss_pnts, "gauss_point_volume"); + case gauss_pt_state::fields: + if (fields.size() == 0) this->fields = DCArrayKokkos(num_gauss_pnts, "gauss_point_fields"); break; default: std::cout<<"Desired gauss point state not understood in GaussPoint_t initialize"< Date: Wed, 5 Nov 2025 11:34:25 -0600 Subject: [PATCH 18/52] ENH: Working on defining the communication plan for MPI types --- examples/mesh_decomp/communication_plan.h | 534 ++++++++-------------- examples/mesh_decomp/decomp_utils.h | 165 ++----- examples/mesh_decomp/mpi_type.h | 366 +++++++-------- 3 files changed, 413 insertions(+), 652 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index 7c6f9ecb..1c95a40a 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -1,348 +1,216 @@ -// /** -// * @struct CommunicationPlan -// * @brief Manages efficient MPI communication for ghost element and node data exchange -// * -// * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. -// * Designed to be embedded in distributed data structures for automatic ghost synchronization. 
-// * -// * Usage pattern in distributed structures: -// * node.velocity.comm() -> automatically syncs ghost nodes -// * elem.density.comm() -> automatically syncs ghost elements -// * -// */ -// struct CommunicationPlan { - -// // ======================================================================== -// // CORE DATA STRUCTURES - FLAT ARRAYS ONLY -// // ======================================================================== +/** + * @struct CommunicationPlan + * @brief Manages efficient MPI communication for ghost element and node data exchange + * + * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. + * Designed to be embedded in distributed data structures for automatic ghost synchronization. + * + * Usage pattern in distributed structures: + * node.velocity.comm() -> automatically syncs ghost nodes + * elem.density.comm() -> automatically syncs ghost elements + * + */ + struct CommunicationPlan { + + // ======================================================================== + // Metadata for MPI neighbor graph communication + // ======================================================================== + // MPI world communicator + MPI_Comm mpi_comm_world; + bool has_comm_world = false; + int world_size = -1; -// // --- Ghost Send Plan: Owned elements/nodes -> destination ranks --- (Works for both elements and nodes) -// int num_send_ranks; // Number of destination ranks -// DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs -// DCArrayKokkos send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids -// DCArrayKokkos send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send -// std::vector send_ghost_gids; // [size: total_send_ghosts] Global IDs (for debug/validation) - -// // --- Ghost Receive Plan: Ghost elements/nodes <- source ranks --- (Works for both elements and nodes) -// int num_recv_ranks; // Number of source ranks -// DCArrayKokkos 
recv_rank_ids; // [size: num_recv_ranks] Source rank IDs -// DCArrayKokkos recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids -// DCArrayKokkos recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) -// std::vector recv_ghost_gids; // [size: total_recv_ghosts] Global IDs + // MPI graph communicator + MPI_Comm mpi_comm_graph; + bool has_comm_graph = false; - -// DCArrayKokkos send_requests; // Request handles for sends -// DCArrayKokkos recv_requests; // Request handles for receives -// DCArrayKokkos mpi_statuses; // Status array for MPI_Waitall - -// // --- Persistent communication (optional optimization) --- -// DCArrayKokkos persistent_send_requests; -// DCArrayKokkos persistent_recv_requests; -// bool has_persistent_comm; - - -// // --- Distributed Graph Topology for Neighborhood Collectives --- -// MPI_Comm graph_comm; // Graph communicator encoding sparse communication pattern -// bool has_graph_comm; // Whether graph communicator is initialized - -// // Counts and displacements for MPI_Neighbor_alltoallv -// DCArrayKokkos send_counts; // [num_send_ranks] Number of items to send per neighbor -// DCArrayKokkos send_displs; // [num_send_ranks] Displacements in send buffer -// DCArrayKokkos recv_counts; // [num_recv_ranks] Number of items to recv per neighbor -// DCArrayKokkos recv_displs; // [num_recv_ranks] Displacements in recv buffer - -// // --- Persistent Neighborhood Collectives (MPI-4.0+) --- -// MPI_Request persistent_neighbor_request; // Persistent request for neighborhood collective -// bool has_persistent_neighbor; // Whether persistent neighborhood is initialized -// int persistent_num_fields; // Fields per item for persistent request - - -// // ======================================================================== -// // CONSTRUCTOR / INITIALIZATION -// // ======================================================================== - -// CommunicationPlan() -// : num_send_ranks(0), 
num_recv_ranks(0), -// has_persistent_comm(false), -// has_graph_comm(false), -// has_persistent_neighbor(false), -// graph_comm(MPI_COMM_NULL), -// persistent_neighbor_request(MPI_REQUEST_NULL), -// persistent_num_fields(0) {} - - -// // Destructor to free MPI resources -// ~CommunicationPlan() { -// // Free persistent neighborhood collective -// if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { -// MPI_Request_free(&persistent_neighbor_request); -// } - -// // Free graph communicator -// if (has_graph_comm && graph_comm != MPI_COMM_NULL) { -// MPI_Comm_free(&graph_comm); -// } -// } - - -// void initialize(int num_send_ranks, int num_recv_ranks){ -// this->num_send_ranks = num_send_ranks; -// this->num_recv_ranks = num_recv_ranks; - -// send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); -// recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); -// send_ghost_offsets = DCArrayKokkos(num_send_ranks + 1, "send_ghost_offsets"); -// recv_ghost_offsets = DCArrayKokkos(num_recv_ranks + 1, "recv_ghost_offsets"); -// send_ghost_lids = DCArrayKokkos(total_send_ghosts, "send_ghost_lids"); -// recv_ghost_lids = DCArrayKokkos(total_recv_ghosts, "recv_ghost_lids"); -// send_ghost_gids = std::vector(total_send_ghosts, "send_ghost_gids"); -// recv_ghost_gids = std::vector(total_recv_ghosts, "recv_ghost_gids"); -// send_requests = DCArrayKokkos(total_send_ghosts, "send_requests"); -// recv_requests = DCArrayKokkos(total_recv_ghosts, "recv_requests"); -// mpi_statuses = DCArrayKokkos(total_send_ghosts + total_recv_ghosts, "mpi_statuses"); -// persistent_send_requests = DCArrayKokkos(total_send_ghosts, "persistent_send_requests"); -// persistent_recv_requests = DCArrayKokkos(total_recv_ghosts, "persistent_recv_requests"); -// send_counts = DCArrayKokkos(num_send_ranks, "send_counts"); -// send_displs = DCArrayKokkos(num_send_ranks, "send_displs"); -// recv_counts = DCArrayKokkos(num_recv_ranks, "recv_counts"); -// recv_displs = 
DCArrayKokkos(num_recv_ranks, "recv_displs"); - -// } - + // Number of send and recv ranks + int num_send_ranks; // In MPI language, this is the outdegree of the graph communicator + int num_recv_ranks; // In MPI language, this is the indegree of the graph communicator + + // Rank IDs for send and recv ranks + DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs + DCArrayKokkos recv_rank_ids; // [size: num_recv_ranks] Source rank IDs + + // recv_weights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* recv_weights = MPI_UNWEIGHTED; // [size: num_recv_ranks] Weights on incoming edges, set to MPI_UNWEIGHTED if not used + + // send_weights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* send_weights = MPI_UNWEIGHTED; // [size: num_send_ranks] Weights on outgoing edges, set to MPI_UNWEIGHTED if not used + + // info: Hints for optimization (MPI_INFO_NULL means use defaults) + MPI_Info info = MPI_INFO_NULL; + + // reorder: Whether to allow MPI to reorder ranks for optimization (0=no reordering) + // Setting to 0 preserves original rank numbering + // Note: In the future, we may want to allow MPI to reorder ranks for optimization by setting to 1, + // this would allow MPI to reorder the ranks to make them physically closer on the hardware. + // This is a good optimization for large meshes, but will require maps from MPI_comm_world rank IDs to the new reordered rank IDs. 
+ int reorder = 0; -// // ======================================================================== -// // INLINE IMPLEMENTATIONS - NEIGHBORHOOD COLLECTIVES -// // ======================================================================== + // ======================================================================== + // CONSTRUCTOR / INITIALIZATION + // ======================================================================== -// /** -// * @brief Create distributed graph communicator from communication pattern -// */ -// inline void create_graph_communicator(MPI_Comm base_comm) { - -// if (has_graph_comm) { -// std::cerr << "Warning: Graph communicator already created, skipping." << std::endl; -// return; -// } - -// int indegree = num_recv_ranks; // Number of ranks we receive FROM -// int outdegree = num_send_ranks; // Number of ranks we send TO - -// // Create the distributed graph communicator -// // MPI_Dist_graph_create_adjacent signature: -// // (comm_old, indegree, sources[], sourceweights, outdegree, dests[], destweights, -// // info, reorder, comm_dist_graph) -// int reorder = 0; // Don't reorder ranks (keep same as base_comm) - -// MPI_Dist_graph_create_adjacent( -// base_comm, // Base communicator -// indegree, // We receive from num_recv_ranks neighbors -// recv_rank_ids.data(), // Source ranks (we receive from these) -// MPI_UNWEIGHTED, // No edge weights for sources -// outdegree, // We send to num_send_ranks neighbors -// send_rank_ids.data(), // Destination ranks (we send to these) -// MPI_UNWEIGHTED, // No edge weights for destinations -// MPI_INFO_NULL, // No special hints -// reorder, // Don't reorder ranks -// &graph_comm // Output: new graph communicator -// ); - -// has_graph_comm = true; - -// // Pre-allocate counts and displacements arrays -// send_counts.resize(num_send_ranks); -// send_displs.resize(num_send_ranks); -// recv_counts.resize(num_recv_ranks); -// recv_displs.resize(num_recv_ranks); -// } + CommunicationPlan() + : 
num_send_ranks(0), num_recv_ranks(0), + has_comm_graph(false) {} -// /** -// * @brief Exchange ghost data using MPI_Neighbor_alltoallv -// */ -// inline void exchange_ghosts_neighborhood(double* data_ptr, int num_fields) { - -// if (!has_graph_comm) { -// std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; -// return; -// } - -// // 1. Pack send buffer from owned items -// int total_send = send_ghost_lids.size(); -// ghost_send_buffer.resize(total_send * num_fields); - -// for (size_t i = 0; i < send_ghost_lids.size(); i++) { -// int local_id = send_ghost_lids[i]; -// for (int f = 0; f < num_fields; f++) { -// ghost_send_buffer[i * num_fields + f] = data_ptr[local_id * num_fields + f]; -// } -// } - -// // 2. Update counts and displacements for this num_fields -// for (int i = 0; i < num_send_ranks; i++) { -// int start_idx = send_ghost_offsets[i]; -// int end_idx = send_ghost_offsets[i + 1]; -// send_counts[i] = (end_idx - start_idx) * num_fields; -// send_displs[i] = start_idx * num_fields; -// } - -// int total_recv = recv_ghost_lids.size(); -// ghost_recv_buffer.resize(total_recv * num_fields); - -// for (int i = 0; i < num_recv_ranks; i++) { -// int start_idx = recv_ghost_offsets[i]; -// int end_idx = recv_ghost_offsets[i + 1]; -// recv_counts[i] = (end_idx - start_idx) * num_fields; -// recv_displs[i] = start_idx * num_fields; -// } - -// // 3. 
Execute neighborhood collective (BLOCKING but fast with graph_comm) -// // MPI_Neighbor_alltoallv signature: -// // (sendbuf, sendcounts[], sdispls[], sendtype, -// // recvbuf, recvcounts[], rdispls[], recvtype, comm) -// MPI_Neighbor_alltoallv( -// ghost_send_buffer.data(), // Send buffer -// send_counts.data(), // Send counts per neighbor -// send_displs.data(), // Send displacements -// MPI_DOUBLE, // Send type -// ghost_recv_buffer.data(), // Receive buffer -// recv_counts.data(), // Receive counts per neighbor -// recv_displs.data(), // Receive displacements -// MPI_DOUBLE, // Receive type -// graph_comm // Graph communicator (NOT MPI_COMM_WORLD!) -// ); - -// // 4. Unpack receive buffer into ghost items -// for (size_t i = 0; i < recv_ghost_lids.size(); i++) { -// int ghost_local_id = recv_ghost_lids[i]; -// for (int f = 0; f < num_fields; f++) { -// data_ptr[ghost_local_id * num_fields + f] = ghost_recv_buffer[i * num_fields + f]; -// } -// } -// } - + // Destructor to free MPI resources + ~CommunicationPlan() { + // Free graph communicator + if (has_comm_graph && mpi_comm_graph != MPI_COMM_NULL) { + MPI_Comm_free(&mpi_comm_graph); + } + } -// /** -// * @brief Initialize persistent neighborhood collective (MPI-4.0+) -// */ -// inline void init_persistent_neighborhood(int num_fields) { - -// if (!has_graph_comm) { -// std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; -// return; -// } - -// if (has_persistent_neighbor) { -// std::cerr << "Warning: Persistent neighborhood already initialized, freeing and re-creating." 
<< std::endl; -// free_persistent_neighborhood(); -// } - -// persistent_num_fields = num_fields; - -// // Allocate buffers -// int total_send = send_ghost_lids.size(); -// int total_recv = recv_ghost_lids.size(); -// ghost_send_buffer.resize(total_send * num_fields); -// ghost_recv_buffer.resize(total_recv * num_fields); - -// // Setup counts and displacements for persistent request -// for (int i = 0; i < num_send_ranks; i++) { -// int start_idx = send_ghost_offsets[i]; -// int end_idx = send_ghost_offsets[i + 1]; -// send_counts[i] = (end_idx - start_idx) * num_fields; -// send_displs[i] = start_idx * num_fields; -// } - -// for (int i = 0; i < num_recv_ranks; i++) { -// int start_idx = recv_ghost_offsets[i]; -// int end_idx = recv_ghost_offsets[i + 1]; -// recv_counts[i] = (end_idx - start_idx) * num_fields; -// recv_displs[i] = start_idx * num_fields; -// } - -// #if MPI_VERSION >= 4 -// // MPI-4.0+ persistent neighborhood collective -// // MPI_Neighbor_alltoallv_init signature (similar to MPI_Neighbor_alltoallv but creates request): -// // (sendbuf, sendcounts[], sdispls[], sendtype, -// // recvbuf, recvcounts[], rdispls[], recvtype, comm, info, request) -// MPI_Neighbor_alltoallv_init( -// ghost_send_buffer.data(), send_counts.data(), send_displs.data(), MPI_DOUBLE, -// ghost_recv_buffer.data(), recv_counts.data(), recv_displs.data(), MPI_DOUBLE, -// graph_comm, -// MPI_INFO_NULL, -// &persistent_neighbor_request -// ); -// has_persistent_neighbor = true; -// #else -// int rank; -// MPI_Comm_rank(MPI_COMM_WORLD, &rank); -// if (rank == 0) { -// std::cerr << "Warning: MPI-4.0 required for persistent neighborhood collectives" << std::endl; -// std::cerr << " Detected MPI version: " << MPI_VERSION << "." 
<< MPI_SUBVERSION << std::endl; -// std::cerr << " Will fall back to standard neighborhood collective" << std::endl; -// } -// has_persistent_neighbor = false; -// #endif -// } + void initialize(MPI_Comm comm_world){ + this->mpi_comm_world = comm_world; + has_comm_world = true; + MPI_Comm_size(comm_world, &world_size); + } -// /** -// * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) -// */ -// inline void exchange_ghosts_persistent(double* data_ptr) { + void initialize_graph_communicator(int num_send_ranks, int* send_rank_ids, int num_recv_ranks, int* recv_rank_ids){ -// #if MPI_VERSION >= 4 -// if (!has_persistent_neighbor) { -// std::cerr << "Error: Must call init_persistent_neighborhood() first!" << std::endl; -// std::cerr << " Falling back to standard neighborhood collective..." << std::endl; -// exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); -// return; -// } + if(!has_comm_world){ + throw std::runtime_error("MPI communicator for the world has not been initialized"); + } -// // 1. Pack send buffer (same memory location as during init) -// for (size_t i = 0; i < send_ghost_lids.size(); i++) { -// int local_id = send_ghost_lids[i]; -// for (int f = 0; f < persistent_num_fields; f++) { -// ghost_send_buffer[i * persistent_num_fields + f] = -// data_ptr[local_id * persistent_num_fields + f]; -// } -// } - -// // 2. Start persistent request (VERY fast - no setup overhead) -// MPI_Start(&persistent_neighbor_request); - -// // 3. Wait for completion -// MPI_Wait(&persistent_neighbor_request, MPI_STATUS_IGNORE); - -// // 4. 
Unpack receive buffer -// for (size_t i = 0; i < recv_ghost_lids.size(); i++) { -// int ghost_id = recv_ghost_lids[i]; -// for (int f = 0; f < persistent_num_fields; f++) { -// data_ptr[ghost_id * persistent_num_fields + f] = -// ghost_recv_buffer[i * persistent_num_fields + f]; -// } -// } -// #else -// // Fallback to standard method if MPI-4 not available -// exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); -// #endif -// } - - -// /** -// * @brief Free persistent neighborhood collective resources -// */ -// inline void free_persistent_neighborhood() { -// #if MPI_VERSION >= 4 -// if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { -// MPI_Request_free(&persistent_neighbor_request); -// persistent_neighbor_request = MPI_REQUEST_NULL; -// has_persistent_neighbor = false; -// } -// #endif -// } - -// }; + this->num_send_ranks = num_send_ranks; + this->num_recv_ranks = num_recv_ranks; + + this->send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); + for(int i = 0; i < num_send_ranks; i++){ + this->send_rank_ids(i) = send_rank_ids[i]; + } + + + this->recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_rank_ids(i) = recv_rank_ids[i]; + } + + MPI_Dist_graph_create_adjacent( + mpi_comm_world, + num_recv_ranks, + this->recv_rank_ids.host_pointer(), + recv_weights, + num_send_ranks, + this->send_rank_ids.host_pointer(), + send_weights, + info, + reorder, + &mpi_comm_graph + ); + + has_comm_graph = true; + } + + void verify_graph_communicator(){ + if(!has_comm_graph){ + throw std::runtime_error("MPI graph communicator has not been initialized"); + } + + // ============================================================================ + // Verify the distributed graph communicator + // ============================================================================ + // Query the graph to verify it matches what we specified + int indegree_out, outdegree_out, 
weighted; + MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); + + // Allocate arrays to receive neighbor information + std::vector sources_out(indegree_out); + std::vector sourceweights_out(indegree_out); + std::vector destinations_out(outdegree_out); + std::vector destweights_out(outdegree_out); + + // Retrieve the actual neighbors from the graph communicator + MPI_Dist_graph_neighbors(mpi_comm_graph, + indegree_out, sources_out.data(), sourceweights_out.data(), + outdegree_out, destinations_out.data(), destweights_out.data()); + + int rank = -1; + MPI_Comm_rank(mpi_comm_world, &rank); + + // Additional verification: Check if the queried values match our input + bool verification_passed = true; + + // Print verification information for each rank sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(mpi_comm_world); + if (rank == r) { + std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; + std::cout << " Indegree (receives from " << indegree_out << " ranks): "; + for (int i = 0; i < indegree_out; ++i) { + std::cout << sources_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; + for (int i = 0; i < outdegree_out; ++i) { + std::cout << destinations_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; + } + MPI_Barrier(mpi_comm_world); + } + + // Check if the counts match our stored values + if (indegree_out != num_recv_ranks) { + std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " + << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; + verification_passed = false; + } + if (outdegree_out != num_send_ranks) { + std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! 
" + << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; + verification_passed = false; + } + + // Check if source ranks match (build set from our stored recv_rank_ids) + std::set sources_set_in; + for (int i = 0; i < num_recv_ranks; ++i) { + sources_set_in.insert(recv_rank_ids.host(i)); + } + std::set sources_set_out(sources_out.begin(), sources_out.end()); + if (sources_set_in != sources_set_out) { + std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" << std::endl; + verification_passed = false; + } + + // Check if destination ranks match (build set from our stored send_rank_ids) + std::set dests_set_in; + for (int i = 0; i < num_send_ranks; ++i) { + dests_set_in.insert(send_rank_ids.host(i)); + } + std::set dests_set_out(destinations_out.begin(), destinations_out.end()); + if (dests_set_in != dests_set_out) { + std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; + verification_passed = false; + } + + // Global verification check + int local_passed = verification_passed ? 
1 : 0; + int global_passed = 0; + MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); + MPI_Barrier(mpi_comm_world); + if (rank == 0) { + if (global_passed) { + std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(mpi_comm_world); + } +}; diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 752b39e6..d4981a17 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -14,6 +14,7 @@ #include "mesh.h" #include "state.h" #include "mesh_io.h" +#include "communication_plan.h" // Include Scotch headers @@ -2077,13 +2078,14 @@ void partition_mesh( // ****************************************************************************************** // Create MPI distributed graph communicator for element communication // ****************************************************************************************** + + + CommunicationPlan element_communication_plan; + element_communication_plan.initialize(MPI_COMM_WORLD); // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator // that efficiently represents the communication pattern between ranks. // This allows MPI to optimize communication based on the actual connectivity pattern. - // ---------- Prepare input communicator ---------- - // comm_old: The base communicator from which to create the graph communicator - MPI_Comm comm_old = MPI_COMM_WORLD; // ---------- Prepare INCOMING edges (sources) ---------- // indegree: Number of ranks from which this rank will RECEIVE data @@ -2096,6 +2098,7 @@ void partition_mesh( // sources: Array of source rank IDs (ranks we receive from) // Each element corresponds to a rank that owns elements we ghost int* sources = (indegree > 0) ? 
ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) // Could be used to specify communication volume if needed for optimization @@ -2109,132 +2112,22 @@ void partition_mesh( // destinations: Array of destination rank IDs (ranks we send to) // Each element corresponds to a rank that ghosts our owned elements int* destinations = (outdegree > 0) ? ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; - - // destweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) - // Could be used to specify communication volume if needed for optimization - int* destweights = MPI_UNWEIGHTED; - - // ---------- Additional parameters ---------- - // info: Hints for optimization (MPI_INFO_NULL means use defaults) - MPI_Info info = MPI_INFO_NULL; - - // reorder: Whether to allow MPI to reorder ranks for optimization (0=no reordering) - // Setting to 0 preserves original rank numbering - int reorder = 0; - - // ---------- Output communicator ---------- - // graph_comm: The new distributed graph communicator that will be created - MPI_Comm graph_comm; - - // Create the distributed graph communicator - // This call collectively creates a communicator where each rank specifies: - // - Which ranks it receives from (sources/indegree) - // - Which ranks it sends to (destinations/outdegree) - // MPI can then optimize collective operations and point-to-point communication - // based on this connectivity information. 
- MPI_Dist_graph_create_adjacent( - comm_old, // Input: base communicator - indegree, // Input: number of incoming neighbors (ranks we receive from) - sources, // Input: array of source ranks [indegree elements] - sourceweights, // Input: weights on incoming edges (MPI_UNWEIGHTED) - outdegree, // Input: number of outgoing neighbors (ranks we send to) - destinations, // Input: array of destination ranks [outdegree elements] - destweights, // Input: weights on outgoing edges (MPI_UNWEIGHTED) - info, // Input: optimization hints (MPI_INFO_NULL) - reorder, // Input: allow rank reordering (0=no) - &graph_comm // Output: new distributed graph communicator - ); + // Initialize the graph communicator for element communication + element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); + // Optional: Verify the graph communicator was created successfully if (rank == 0) { std::cout << " Created MPI distributed graph communicator for element communication" << std::endl; } MPI_Barrier(MPI_COMM_WORLD); + + // ============================================================================ // Verify the distributed graph communicator // ============================================================================ - // Query the graph to verify it matches what we specified - int indegree_out, outdegree_out, weighted; - MPI_Dist_graph_neighbors_count(graph_comm, &indegree_out, &outdegree_out, &weighted); - - // Allocate arrays to receive neighbor information - std::vector sources_out(indegree_out); - std::vector sourceweights_out(indegree_out); - std::vector destinations_out(outdegree_out); - std::vector destweights_out(outdegree_out); - - // Retrieve the actual neighbors from the graph communicator - MPI_Dist_graph_neighbors(graph_comm, - indegree_out, sources_out.data(), sourceweights_out.data(), - outdegree_out, destinations_out.data(), destweights_out.data()); - - // Print verification 
information for each rank sequentially - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; - std::cout << " Indegree (receives from " << indegree_out << " ranks): "; - for (int i = 0; i < indegree_out; ++i) { - std::cout << sources_out[i] << " "; - } - std::cout << std::endl; - - std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; - for (int i = 0; i < outdegree_out; ++i) { - std::cout << destinations_out[i] << " "; - } - std::cout << std::endl; - - std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - // Additional verification: Check if the queried values match our input - bool verification_passed = true; - if (indegree_out != indegree) { - std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " - << "Expected " << indegree << ", got " << indegree_out << std::endl; - verification_passed = false; - } - if (outdegree_out != outdegree) { - std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " - << "Expected " << outdegree << ", got " << outdegree_out << std::endl; - verification_passed = false; - } - - // Check if source and destination ranks match (order may differ) - std::set sources_set_in(ghost_elem_receive_ranks_vec.begin(), ghost_elem_receive_ranks_vec.end()); - std::set sources_set_out(sources_out.begin(), sources_out.end()); - if (sources_set_in != sources_set_out) { - std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" << std::endl; - verification_passed = false; - } - - std::set dests_set_in(ghost_comm_ranks_vec.begin(), ghost_comm_ranks_vec.end()); - std::set dests_set_out(destinations_out.begin(), destinations_out.end()); - if (dests_set_in != dests_set_out) { - std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" 
<< std::endl; - verification_passed = false; - } - - // Global verification check - int local_passed = verification_passed ? 1 : 0; - int global_passed = 0; - MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) { - if (global_passed) { - std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; - } else { - std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - - - + element_communication_plan.verify_graph_communicator(); // ****************************************************************************************** @@ -2262,8 +2155,8 @@ void partition_mesh( // - elem_sendcounts[i] = number of elements to send to i-th outgoing neighbor (destinations_out[i]) // - elem_sdispls[i] = starting position in send buffer for i-th outgoing neighbor - std::vector elem_sendcounts(outdegree_out, 0); - std::vector elem_sdispls(outdegree_out, 0); + std::vector elem_sendcounts(element_communication_plan.num_send_ranks, 0); + std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); // Count how many boundary elements go to each destination rank // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element @@ -2280,8 +2173,8 @@ void partition_mesh( // Fill elem_sendcounts based on the graph communicator's destination order int total_send = 0; - for (int i = 0; i < outdegree_out; i++) { - int dest_rank = destinations_out[i]; + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); elem_sendcounts[i] = static_cast(elems_to_send_by_rank[dest_rank].size()); elem_sdispls[i] = total_send; total_send += elem_sendcounts[i]; @@ -2293,8 +2186,8 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); if (rank == r) { std::cout << "[rank " << rank << "] Send counts: "; - for (int i 
= 0; i < outdegree_out; i++) { - std::cout << "to_rank_" << destinations_out[i] << "=" << elem_sendcounts[i] << " "; + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + std::cout << "to_rank_" << element_communication_plan.send_rank_ids.host(i) << "=" << elem_sendcounts[i] << " "; } std::cout << "(total=" << total_send << ")" << std::endl; } @@ -2306,8 +2199,8 @@ void partition_mesh( // - elem_recvcounts[i] = number of elements to receive from i-th incoming neighbor (sources_out[i]) // - elem_rdispls[i] = starting position in recv buffer for i-th incoming neighbor - std::vector elem_recvcounts(indegree_out, 0); - std::vector elem_rdispls(indegree_out, 0); + std::vector elem_recvcounts(element_communication_plan.num_recv_ranks, 0); + std::vector elem_rdispls(element_communication_plan.num_recv_ranks, 0); // Count how many ghost elements come from each source rank // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element @@ -2320,8 +2213,8 @@ void partition_mesh( // Fill elem_recvcounts based on the graph communicator's source order int total_recv = 0; - for (int i = 0; i < indegree_out; i++) { - int source_rank = sources_out[i]; + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); elem_recvcounts[i] = static_cast(elems_to_recv_by_rank[source_rank].size()); elem_rdispls[i] = total_recv; total_recv += elem_recvcounts[i]; @@ -2333,8 +2226,8 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); if (rank == r) { std::cout << "[rank " << rank << "] Recv counts: "; - for (int i = 0; i < indegree_out; i++) { - std::cout << "from_rank_" << sources_out[i] << "=" << elem_recvcounts[i] << " "; + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + std::cout << "from_rank_" << element_communication_plan.recv_rank_ids.host(i) << "=" << elem_recvcounts[i] << " "; } std::cout << "(total=" << total_recv << ", expected_ghosts=" << 
final_mesh.num_ghost_elems << ")" << std::endl; } @@ -2346,8 +2239,8 @@ void partition_mesh( std::vector elem_send_buffer(total_send); int send_idx = 0; - for (int i = 0; i < outdegree_out; i++) { - int dest_rank = destinations_out[i]; + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); const auto& elems_for_this_rank = elems_to_send_by_rank[dest_rank]; for (int elem_lid : elems_for_this_rank) { @@ -2373,7 +2266,7 @@ void partition_mesh( elem_recvcounts.data(), // Number of elements to receive from each incoming neighbor [indegree] elem_rdispls.data(), // Displacement in recv buffer for each incoming neighbor [indegree] MPI_DOUBLE, // Receive data type - graph_comm // Distributed graph communicator + element_communication_plan.mpi_comm_graph // Distributed graph communicator ); // ========== Update ghost element fields from receive buffer ========== @@ -2383,8 +2276,8 @@ void partition_mesh( std::vector ghost_updated(final_mesh.num_ghost_elems, false); int recv_idx = 0; - for (int i = 0; i < indegree_out; i++) { - int source_rank = sources_out[i]; + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); const auto& ghost_indices = elems_to_recv_by_rank[source_rank]; for (int ghost_idx : ghost_indices) { diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 35b73985..f4731302 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -1,243 +1,243 @@ -// #ifndef MPIDARRAYKOKKOS_H -// #define MPIDARRAYKOKKOS_H - -// #include "matar.h" -// #include "communication_plan.h" - -// using namespace mtr; - -// ///////////////////////// -// // MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. -// // -// // Enhanced with automatic ghost synchronization via CommunicationPlan. 
-// // Allocates space for owned + ghost items and provides communicate() method. -// // -// // Usage: -// // node.coords.communicate() -> syncs ghost nodes automatically -// // elem.density.communicate() -> syncs ghost elements automatically -// ///////////////////////// -// template -// class MPIDArrayKokkos { - -// // this is manage -// using TArray1D = Kokkos::DualView ; +#ifndef MPIDARRAYKOKKOS_H +#define MPIDARRAYKOKKOS_H + +#include "matar.h" +#include "communication_plan.h" + +using namespace mtr; + +///////////////////////// +// MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// +// Enhanced with automatic ghost synchronization via CommunicationPlan. +// Allocates space for owned + ghost items and provides communicate() method. +// +// Usage: +// node.coords.communicate() -> syncs ghost nodes automatically +// elem.density.communicate() -> syncs ghost elements automatically +///////////////////////// +template +class MPIDArrayKokkos { + + // this is manage + using TArray1D = Kokkos::DualView ; -// protected: -// size_t dims_[7]; -// size_t length_; -// size_t order_; // tensor order (rank) -// int mpi_recv_rank_; -// int mpi_tag_; -// MPI_Comm mpi_comm_; -// MPI_Status mpi_status_; -// MPI_Datatype mpi_datatype_; -// MPI_Request mpi_request_; -// TArray1D this_array_; +protected: + size_t dims_[7]; + size_t length_; + size_t order_; // tensor order (rank) + int mpi_recv_rank_; + int mpi_tag_; + MPI_Comm mpi_comm_; + MPI_Status mpi_status_; + MPI_Datatype mpi_datatype_; + MPI_Request mpi_request_; + TArray1D this_array_; -// // --- Ghost Communication Support --- -// CommunicationPlan* comm_plan_; // Pointer to shared communication plan -// size_t num_owned_items_; // Number of owned items (nodes/elements) -// size_t num_total_items_; // Total items including ghosts (owned + ghost) -// size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) + // --- Ghost Communication Support --- + CommunicationPlan* comm_plan_; 
// Pointer to shared communication plan + size_t num_owned_items_; // Number of owned items (nodes/elements) + size_t num_total_items_; // Total items including ghosts (owned + ghost) + size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) -// void set_mpi_type(); + void set_mpi_type(); -// public: -// // Data member to access host view -// ViewCArray host; +public: + // Data member to access host view + ViewCArray host; -// MPIDArrayKokkos(); + MPIDArrayKokkos(); -// MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, -// size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, -// size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, -// size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, -// size_t dim3, size_t dim4, 
size_t dim5, -// size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); -// // ======================================================================== -// // DISTRIBUTED COMMUNICATION METHODS (NEW) -// // ======================================================================== + // ======================================================================== + // DISTRIBUTED COMMUNICATION METHODS (NEW) + // ======================================================================== -// /** -// * @brief Set communication plan and ghost metadata -// * -// * Call this ONCE after allocating the array to enable ghost communication. -// * Multiple fields can share the same CommunicationPlan pointer. -// * -// * @param plan Pointer to shared CommunicationPlan (node or element plan) -// * @param num_owned Number of owned items on this rank -// * @param num_total Total items including ghosts (owned + ghost) -// * -// * Example: -// * node.coords = MPIDArrayKokkos(num_total_nodes, 3); -// * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); -// */ -// void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); + /** + * @brief Set communication plan and ghost metadata + * + * Call this ONCE after allocating the array to enable ghost communication. + * Multiple fields can share the same CommunicationPlan pointer. 
+ * + * @param plan Pointer to shared CommunicationPlan (node or element plan) + * @param num_owned Number of owned items on this rank + * @param num_total Total items including ghosts (owned + ghost) + * + * Example: + * node.coords = MPIDArrayKokkos(num_total_nodes, 3); + * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); + */ + void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); -// /** -// * @brief Synchronize ghost data using neighborhood collectives -// * -// * Automatically exchanges boundary → ghost data for this field. -// * Uses the CommunicationPlan provided via set_communication_plan(). -// * -// * Workflow: -// * 1. Updates host data from device (if needed) -// * 2. Packs owned boundary items -// * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) -// * 4. Unpacks into ghost items -// * 5. Updates device with new ghost data -// * -// * Example usage: -// * // Update owned nodes -// * for (int i = 0; i < num_owned_nodes; i++) { -// * node.coords(i, 0) += dt * velocity(i, 0); -// * } -// * -// * // Sync ghosts -// * node.coords.communicate(); -// * -// * // Now ghost data is current -// */ -// void communicate(); + /** + * @brief Synchronize ghost data using neighborhood collectives + * + * Automatically exchanges boundary → ghost data for this field. + * Uses the CommunicationPlan provided via set_communication_plan(). + * + * Workflow: + * 1. Updates host data from device (if needed) + * 2. Packs owned boundary items + * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) + * 4. Unpacks into ghost items + * 5. 
Updates device with new ghost data + * + * Example usage: + * // Update owned nodes + * for (int i = 0; i < num_owned_nodes; i++) { + * node.coords(i, 0) += dt * velocity(i, 0); + * } + * + * // Sync ghosts + * node.coords.communicate(); + * + * // Now ghost data is current + */ + void communicate(); -// /** -// * @brief Non-blocking version: start ghost exchange -// * -// * For advanced users who want to overlap computation with communication. -// * Must call communicate_wait() before accessing ghost data. -// */ -// void communicate_begin(); + /** + * @brief Non-blocking version: start ghost exchange + * + * For advanced users who want to overlap computation with communication. + * Must call communicate_wait() before accessing ghost data. + */ + void communicate_begin(); -// /** -// * @brief Wait for non-blocking ghost exchange to complete -// */ -// void communicate_wait(); + /** + * @brief Wait for non-blocking ghost exchange to complete + */ + void communicate_wait(); -// /** -// * @brief Get number of owned items (excludes ghosts) -// */ -// KOKKOS_INLINE_FUNCTION -// size_t num_owned() const { return num_owned_items_; } + /** + * @brief Get number of owned items (excludes ghosts) + */ + KOKKOS_INLINE_FUNCTION + size_t num_owned() const { return num_owned_items_; } -// /** -// * @brief Get total items including ghosts -// */ -// KOKKOS_INLINE_FUNCTION -// size_t num_total() const { return num_total_items_; } + /** + * @brief Get total items including ghosts + */ + KOKKOS_INLINE_FUNCTION + size_t num_total() const { return num_total_items_; } -// /** -// * @brief Check if ghost communication is configured -// */ -// bool has_communication_plan() const { return comm_plan_ != nullptr; } + /** + * @brief Check if ghost communication is configured + */ + bool has_communication_plan() const { return comm_plan_ != nullptr; } -// // These functions can setup the data needed for halo send/receives -// // Not necessary for standard MPI comms -// void mpi_setup(); + // 
These functions can setup the data needed for halo send/receives + // Not necessary for standard MPI comms + void mpi_setup(); -// void mpi_setup(int recv_rank); + void mpi_setup(int recv_rank); -// void mpi_setup(int recv_rank, int tag); + void mpi_setup(int recv_rank, int tag); -// void mpi_setup(int recv_rank, int tag, MPI_Comm comm); + void mpi_setup(int recv_rank, int tag, MPI_Comm comm); -// void mpi_set_rank(int recv_rank); + void mpi_set_rank(int recv_rank); -// void mpi_set_tag(int tag); + void mpi_set_tag(int tag); -// void mpi_set_comm(MPI_Comm comm); + void mpi_set_comm(MPI_Comm comm); -// int get_rank(); + int get_rank(); -// int get_tag(); + int get_tag(); -// MPI_Comm get_comm(); + MPI_Comm get_comm(); -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j, size_t k) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j, size_t k, size_t l) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, -// size_t n) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, -// size_t n, size_t o) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n, size_t o) const; 
-// KOKKOS_INLINE_FUNCTION -// MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); + KOKKOS_INLINE_FUNCTION + MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); -// // GPU Method -// // Method that returns size -// KOKKOS_INLINE_FUNCTION -// size_t size() const; + // GPU Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t size() const; -// // Host Method -// // Method that returns size -// KOKKOS_INLINE_FUNCTION -// size_t extent() const; + // Host Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t extent() const; -// KOKKOS_INLINE_FUNCTION -// size_t dims(size_t i) const; + KOKKOS_INLINE_FUNCTION + size_t dims(size_t i) const; -// KOKKOS_INLINE_FUNCTION -// size_t order() const; + KOKKOS_INLINE_FUNCTION + size_t order() const; -// // Method returns the raw device pointer of the Kokkos DualView -// KOKKOS_INLINE_FUNCTION -// T* device_pointer() const; + // Method returns the raw device pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* device_pointer() const; -// // Method returns the raw host pointer of the Kokkos DualView -// KOKKOS_INLINE_FUNCTION -// T* host_pointer() const; + // Method returns the raw host pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* host_pointer() const; -// // Method returns kokkos dual view -// KOKKOS_INLINE_FUNCTION -// TArray1D get_kokkos_dual_view() const; + // Method returns kokkos dual view + KOKKOS_INLINE_FUNCTION + TArray1D get_kokkos_dual_view() const; -// // Method that update host view -// void update_host(); + // Method that update host view + void update_host(); -// // Method that update device view -// void update_device(); + // Method that update device view + void update_device(); -// // Deconstructor -// virtual KOKKOS_INLINE_FUNCTION -// ~MPIDArrayKokkos (); -// }; // End of MPIDArrayKokkos + // Deconstructor + virtual KOKKOS_INLINE_FUNCTION + ~MPIDArrayKokkos (); +}; // End of MPIDArrayKokkos // // 
============================================================================ From 276dba78da2f8586543341357c22bc2b0f6693df Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 5 Nov 2025 15:13:23 -0600 Subject: [PATCH 19/52] ENH: Fleshing out MPI type and communication plan --- examples/mesh_decomp/communication_plan.h | 86 ++++ examples/mesh_decomp/decomp_utils.h | 201 ++++----- examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mpi_type.h | 486 +++++++++++----------- 4 files changed, 428 insertions(+), 347 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index 1c95a40a..b49befb8 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -51,6 +51,25 @@ // This is a good optimization for large meshes, but will require maps from MPI_comm_world rank IDs to the new reordered rank IDs. int reorder = 0; + + + + + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_per_rank] Indices of items to send to each rank + DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_per_rank] Indices of items to receive from each rank + + DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank + DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank + + + DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank + DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank + + int total_send_count; + int total_recv_count; + + + // ======================================================================== // CONSTRUCTOR / INITIALIZATION @@ -213,4 +232,71 @@ } + void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ + + this->send_indices_ = rank_send_ids; + this->recv_indices_ = rank_recv_ids; + + + // 
Setup send data + this->send_counts_ = DCArrayKokkos(num_send_ranks, "send_counts"); + this->total_send_count = 0; + for(int i = 0; i < num_send_ranks; i++){ + this->send_counts_.host(i) = rank_send_ids.stride_host(i); + this->total_send_count += this->send_counts_.host(i); + } + this->send_counts_.update_device(); + + this->send_displs_ = DCArrayKokkos(num_send_ranks, "send_displs"); + for(int i = 0; i < num_send_ranks; i++){ + this->send_displs_.host(i) = 0; + for(int j = 0; j < i; j++){ + this->send_displs_.host(i) += this->send_counts_.host(j); + } + } + this->send_displs_.update_device(); + + // Setup recv data + this->recv_counts_ = DCArrayKokkos(num_recv_ranks, "recv_counts"); + this->total_recv_count = 0; + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_counts_.host(i) = rank_recv_ids.stride_host(i); + this->total_recv_count += this->recv_counts_.host(i); + } + this->recv_counts_.update_device(); + + this->recv_displs_ = DCArrayKokkos(num_recv_ranks, "recv_displs"); + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_displs_.host(i) = 0; + for(int j = 0; j < i; j++){ + this->recv_displs_.host(i) += this->recv_counts_.host(j); + } + } + this->recv_displs_.update_device(); + + + // Print the send and recv data sequentially per MPI rank for clarity + MPI_Barrier(mpi_comm_world); + int rank, nprocs; + MPI_Comm_rank(mpi_comm_world, &rank); + MPI_Comm_size(mpi_comm_world, &nprocs); + for(int r = 0; r < nprocs; r++) { + MPI_Barrier(mpi_comm_world); + if(rank == r) { + std::cout << "==============================" << std::endl; + std::cout << "CommunicationPlan info for rank " << rank << std::endl; + for(int i = 0; i < num_send_ranks; i++){ + std::cout << " Send count to rank[" << i << "] (dest rank " << this->send_rank_ids.host(i) << "): " << this->send_counts_.host(i) << std::endl; + std::cout << " Send displs to rank[" << i << "]: " << this->send_displs_.host(i) << std::endl; + } + for(int i = 0; i < num_recv_ranks; i++){ + std::cout << " Recv count 
from rank[" << i << "] (source rank " << this->recv_rank_ids.host(i) << "): " << this->recv_counts_.host(i) << std::endl; + std::cout << " Recv displs from rank[" << i << "]: " << this->recv_displs_.host(i) << std::endl; + } + std::cout << "==============================" << std::endl << std::flush; + } + } + MPI_Barrier(mpi_comm_world); + } + }; diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index d4981a17..2a256deb 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -717,9 +717,9 @@ void partition_mesh( - if (print_vtk) { - write_vtk(naive_mesh, naive_node, rank); - } + // if (print_vtk) { + // write_vtk(naive_mesh, naive_node, rank); + // } @@ -2076,7 +2076,7 @@ void partition_mesh( // ****************************************************************************************** -// Create MPI distributed graph communicator for element communication +// Create Communication Plan for element communication // ****************************************************************************************** @@ -2115,48 +2115,22 @@ void partition_mesh( // Initialize the graph communicator for element communication element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); - - // Optional: Verify the graph communicator was created successfully - if (rank == 0) { - std::cout << " Created MPI distributed graph communicator for element communication" << std::endl; - } MPI_Barrier(MPI_COMM_WORLD); - - - - // ============================================================================ - // Verify the distributed graph communicator - // ============================================================================ - element_communication_plan.verify_graph_communicator(); + // Optional: Verify the graph communicator was created successfully + if(print_info) element_communication_plan.verify_graph_communicator(); // 
****************************************************************************************** -// Test element communication using MPI_Neighbor_alltoallv +// Build send counts and displacements for element communication // ****************************************************************************************** - // Gauss points share the same communication plan as elements. - // This test initializes gauss point fields on owned elements and exchanges them with ghost elements. - - print_info = true; // Enable debug output for communication test - - gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}); - - // Initialize the gauss point fields on each rank - // Set owned elements to rank number, ghost elements to -1 (to verify communication) - for (int i = 0; i < final_mesh.num_owned_elems; i++) { - gauss_point.fields.host(i) = static_cast(rank); - } - for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { - gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated - } - gauss_point.fields.update_device(); - // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== + // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== // For MPI_Neighbor_alltoallv with graph communicator: // - elem_sendcounts[i] = number of elements to send to i-th outgoing neighbor (destinations_out[i]) // - elem_sdispls[i] = starting position in send buffer for i-th outgoing neighbor - std::vector elem_sendcounts(element_communication_plan.num_send_ranks, 0); - std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); + // std::vector elem_sendcounts(element_communication_plan.num_send_ranks, 0); + // std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); // Count how many boundary elements go to each destination rank // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element @@ -2170,37 
+2144,24 @@ void partition_mesh( } } } - - // Fill elem_sendcounts based on the graph communicator's destination order - int total_send = 0; + + // Serialize into a DRaggedRightArrayKokkos + CArrayKokkos strides_array(element_communication_plan.num_send_ranks); for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { int dest_rank = element_communication_plan.send_rank_ids.host(i); - elem_sendcounts[i] = static_cast(elems_to_send_by_rank[dest_rank].size()); - elem_sdispls[i] = total_send; - total_send += elem_sendcounts[i]; + strides_array(i) = elems_to_send_by_rank[dest_rank].size(); } - - // Debug: Print send counts - if (print_info) { - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] Send counts: "; - for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { - std::cout << "to_rank_" << element_communication_plan.send_rank_ids.host(i) << "=" << elem_sendcounts[i] << " "; - } - std::cout << "(total=" << total_send << ")" << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); + DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); + + // Fill in the data + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + for (int j = 0; j < elems_to_send_by_rank[dest_rank].size(); j++) { + elems_to_send_by_rank_rr.host(i, j) = elems_to_send_by_rank[dest_rank][j]; } } - - // ========== Build receive counts and displacements for INCOMING neighbors (sources) ========== - // - elem_recvcounts[i] = number of elements to receive from i-th incoming neighbor (sources_out[i]) - // - elem_rdispls[i] = starting position in recv buffer for i-th incoming neighbor - - std::vector elem_recvcounts(element_communication_plan.num_recv_ranks, 0); - std::vector elem_rdispls(element_communication_plan.num_recv_ranks, 0); + elems_to_send_by_rank_rr.update_device(); + // Count how 
many ghost elements come from each source rank // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element @@ -2210,61 +2171,102 @@ void partition_mesh( int source_rank = ghost_elem_owner_ranks[i]; elems_to_recv_by_rank[source_rank].push_back(static_cast(i)); } - - // Fill elem_recvcounts based on the graph communicator's source order - int total_recv = 0; + + // ========== Serialize into a DRaggedRightArrayKokkos ========== + CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { int source_rank = element_communication_plan.recv_rank_ids.host(i); - elem_recvcounts[i] = static_cast(elems_to_recv_by_rank[source_rank].size()); - elem_rdispls[i] = total_recv; - total_recv += elem_recvcounts[i]; + elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); } - - // Debug: Print receive counts - if (print_info) { - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] Recv counts: "; - for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - std::cout << "from_rank_" << element_communication_plan.recv_rank_ids.host(i) << "=" << elem_recvcounts[i] << " "; - } - std::cout << "(total=" << total_recv << ", expected_ghosts=" << final_mesh.num_ghost_elems << ")" << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); + DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); + // Fill in the data + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); + for (int j = 0; j < elems_to_recv_by_rank[source_rank].size(); j++) { + elems_to_recv_by_rank_rr.host(i, j) = elems_to_recv_by_rank[source_rank][j]; } } + elems_to_recv_by_rank_rr.update_device(); + + element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); + 
+ MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished building the send and recv counts and displacements for element communication" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + +// ****************************************************************************************** +// Test element communication using MPI_Neighbor_alltoallv +// ****************************************************************************************** + // Gauss points share the same communication plan as elements. + // This test initializes gauss point fields on owned elements and exchanges them with ghost elements. + + print_info = true; // Enable debug output for communication test + + gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}); // , &element_communication_plan + + // Initialize the gauss point fields on each rank + // Set owned elements to rank number, ghost elements to -1 (to verify communication) + for (int i = 0; i < final_mesh.num_owned_elems; i++) { + gauss_point.fields.host(i) = static_cast(rank); + } + for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { + gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated + } + gauss_point.fields.update_device(); + + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build the send buffer for element communication" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); // ========== Build send buffer organized by destination rank ========== - std::vector elem_send_buffer(total_send); + std::vector elem_send_buffer(element_communication_plan.total_send_count); int send_idx = 0; for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { - int dest_rank = element_communication_plan.send_rank_ids.host(i); - const auto& elems_for_this_rank = elems_to_send_by_rank[dest_rank]; + // Get the number of elements to send to this neighbor + size_t num_elems_to_send = elems_to_send_by_rank_rr.stride_host(i); + if(rank == 0) std::cout << " Sending 
" << num_elems_to_send << " elements to rank " << element_communication_plan.send_rank_ids.host(i) << std::endl; - for (int elem_lid : elems_for_this_rank) { - elem_send_buffer[send_idx++] = gauss_point.fields.host(elem_lid); + for (size_t j = 0; j < num_elems_to_send; j++) { + int elem_lid = elems_to_send_by_rank_rr.host(i, j); + if(rank == 0) std::cout << " Sending element " << elem_lid << std::endl; + double value = gauss_point.fields.host(elem_lid); + + if(rank == 0) std::cout << " Value: " << value << std::endl; + elem_send_buffer[send_idx++] = value; } } + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished building the send buffer for element communication" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); // ========== Allocate receive buffer ========== - std::vector elem_recv_buffer(total_recv); + std::vector elem_recv_buffer(element_communication_plan.total_recv_count); + + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished building the receive buffer for element communication" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to exchange element data using MPI_Neighbor_alltoallv" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); - // ========== Exchange data using MPI_Neighbor_alltoallv ========== - // MPI_Neighbor_alltoallv exchanges data with neighbors in the graph communicator topology - // - elem_sendcounts[i]: number of doubles to send to i-th outgoing neighbor - // - elem_recvcounts[i]: number of doubles to receive from i-th incoming neighbor - // - The order of neighbors must match the order returned by MPI_Dist_graph_neighbors + MPI_Neighbor_alltoallv( elem_send_buffer.data(), // Send buffer with boundary element data - elem_sendcounts.data(), // Number of elements to send to each outgoing neighbor [outdegree] - elem_sdispls.data(), // Displacement in send buffer for each outgoing neighbor [outdegree] + 
element_communication_plan.send_counts_.host_pointer(), // Number of elements to send to each outgoing neighbor [outdegree] + element_communication_plan.send_displs_.host_pointer(), // Displacement in send buffer for each outgoing neighbor [outdegree] MPI_DOUBLE, // Send data type elem_recv_buffer.data(), // Receive buffer for ghost element data - elem_recvcounts.data(), // Number of elements to receive from each incoming neighbor [indegree] - elem_rdispls.data(), // Displacement in recv buffer for each incoming neighbor [indegree] + element_communication_plan.recv_counts_.host_pointer(), // Number of elements to receive from each incoming neighbor [indegree] + element_communication_plan.recv_displs_.host_pointer(), // Displacement in recv buffer for each incoming neighbor [indegree] MPI_DOUBLE, // Receive data type element_communication_plan.mpi_comm_graph // Distributed graph communicator ); @@ -2277,10 +2279,11 @@ void partition_mesh( int recv_idx = 0; for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - int source_rank = element_communication_plan.recv_rank_ids.host(i); - const auto& ghost_indices = elems_to_recv_by_rank[source_rank]; + // Get the number of ghost elements from this source rank + size_t num_ghosts_from_source = elems_to_recv_by_rank_rr.stride_host(i); - for (int ghost_idx : ghost_indices) { + for (size_t j = 0; j < num_ghosts_from_source; j++) { + int ghost_idx = elems_to_recv_by_rank_rr.host(i, j); int ghost_elem_local_id = final_mesh.num_owned_elems + ghost_idx; gauss_point.fields.host(ghost_elem_local_id) = elem_recv_buffer[recv_idx++]; ghost_updated[ghost_idx] = true; diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index b14ee9cd..e1383ccb 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int 
num_elems_dim[3] = {50, 50, 50}; + int num_elems_dim[3] = {20, 20, 20}; // Initial mesh built on rank zero Mesh_t initial_mesh; diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index f4731302..636a1bd0 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -1,5 +1,5 @@ -#ifndef MPIDARRAYKOKKOS_H -#define MPIDARRAYKOKKOS_H +#ifndef MPICARRAYKOKKOS_H +#define MPICARRAYKOKKOS_H #include "matar.h" #include "communication_plan.h" @@ -7,38 +7,42 @@ using namespace mtr; ///////////////////////// -// MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// MPICArrayKokkos: Dual type for managing distributed data on both CPU and GPU. // -// Enhanced with automatic ghost synchronization via CommunicationPlan. -// Allocates space for owned + ghost items and provides communicate() method. -// -// Usage: -// node.coords.communicate() -> syncs ghost nodes automatically -// elem.density.communicate() -> syncs ghost elements automatically ///////////////////////// template -class MPIDArrayKokkos { +class MPICArrayKokkos { - // this is manage - using TArray1D = Kokkos::DualView ; + // Dual view for managing data on both CPU and GPU + DCArrayKokkos this_array_; protected: size_t dims_[7]; size_t length_; size_t order_; // tensor order (rank) - int mpi_recv_rank_; - int mpi_tag_; + MPI_Comm mpi_comm_; MPI_Status mpi_status_; MPI_Datatype mpi_datatype_; MPI_Request mpi_request_; - TArray1D this_array_; + // --- Ghost Communication Support --- CommunicationPlan* comm_plan_; // Pointer to shared communication plan - size_t num_owned_items_; // Number of owned items (nodes/elements) - size_t num_total_items_; // Total items including ghosts (owned + ghost) - size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) + + + DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank + DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive 
from each rank + DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank + DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank + + + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_by_rank] Indices of items to send to each rank + DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank + + + size_t num_owned_; // Number of owned items (nodes/elements) + size_t num_ghost_; // Number of ghost items (nodes/elements) void set_mpi_type(); @@ -46,131 +50,28 @@ class MPIDArrayKokkos { // Data member to access host view ViewCArray host; - MPIDArrayKokkos(); + MPICArrayKokkos(); - MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t 
dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); - - // ======================================================================== - // DISTRIBUTED COMMUNICATION METHODS (NEW) - // ======================================================================== - - /** - * @brief Set communication plan and ghost metadata - * - * Call this ONCE after allocating the array to enable ghost communication. - * Multiple fields can share the same CommunicationPlan pointer. - * - * @param plan Pointer to shared CommunicationPlan (node or element plan) - * @param num_owned Number of owned items on this rank - * @param num_total Total items including ghosts (owned + ghost) - * - * Example: - * node.coords = MPIDArrayKokkos(num_total_nodes, 3); - * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); - */ - void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); - - - /** - * @brief Synchronize ghost data using neighborhood collectives - * - * Automatically exchanges boundary → ghost data for this field. - * Uses the CommunicationPlan provided via set_communication_plan(). - * - * Workflow: - * 1. Updates host data from device (if needed) - * 2. Packs owned boundary items - * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) - * 4. Unpacks into ghost items - * 5. Updates device with new ghost data - * - * Example usage: - * // Update owned nodes - * for (int i = 0; i < num_owned_nodes; i++) { - * node.coords(i, 0) += dt * velocity(i, 0); - * } - * - * // Sync ghosts - * node.coords.communicate(); - * - * // Now ghost data is current - */ - void communicate(); - - - /** - * @brief Non-blocking version: start ghost exchange - * - * For advanced users who want to overlap computation with communication. - * Must call communicate_wait() before accessing ghost data. 
- */ - void communicate_begin(); - - - /** - * @brief Wait for non-blocking ghost exchange to complete - */ - void communicate_wait(); - - - /** - * @brief Get number of owned items (excludes ghosts) - */ - KOKKOS_INLINE_FUNCTION - size_t num_owned() const { return num_owned_items_; } - - - /** - * @brief Get total items including ghosts - */ - KOKKOS_INLINE_FUNCTION - size_t num_total() const { return num_total_items_; } - - - /** - * @brief Check if ghost communication is configured - */ - bool has_communication_plan() const { return comm_plan_ != nullptr; } - - // These functions can setup the data needed for halo send/receives - // Not necessary for standard MPI comms - void mpi_setup(); - - void mpi_setup(int recv_rank); - - void mpi_setup(int recv_rank, int tag); - - void mpi_setup(int recv_rank, int tag, MPI_Comm comm); - - void mpi_set_rank(int recv_rank); - - void mpi_set_tag(int tag); - void mpi_set_comm(MPI_Comm comm); - - int get_rank(); - - int get_tag(); - - MPI_Comm get_comm(); KOKKOS_INLINE_FUNCTION T& operator()(size_t i) const; @@ -198,6 +99,13 @@ class MPIDArrayKokkos { KOKKOS_INLINE_FUNCTION MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); + + // Method to set comm plan + KOKKOS_INLINE_FUNCTION + void initialize_comm_plan(CommunicationPlan* comm_plan); + + + // GPU Method // Method that returns size KOKKOS_INLINE_FUNCTION @@ -240,121 +148,205 @@ class MPIDArrayKokkos { }; // End of MPIDArrayKokkos -// // ============================================================================ -// // INLINE IMPLEMENTATIONS - DISTRIBUTED COMMUNICATION -// // ============================================================================ - -// /** -// * @brief Default constructor - initialize ghost communication members -// */ -// template -// KOKKOS_INLINE_FUNCTION -// MPIDArrayKokkos::MPIDArrayKokkos() -// : comm_plan_(nullptr), -// num_owned_items_(0), -// num_total_items_(0), -// num_fields_(0) -// { -// // Base constructor handles array initialization 
-// } - - -// /** -// * @brief Set communication plan and ghost metadata -// */ -// template -// inline void MPIDArrayKokkos::set_communication_plan( -// CommunicationPlan* plan, -// size_t num_owned, -// size_t num_total) -// { -// comm_plan_ = plan; -// num_owned_items_ = num_owned; -// num_total_items_ = num_total; - -// // Infer number of fields from array dimensions -// // Assumption: dim0 = num_items, dim1+ = fields -// if (order_ == 1) { -// num_fields_ = 1; // Scalar field -// } else if (order_ == 2) { -// num_fields_ = dims_[1]; // Vector field (e.g., coords[num_nodes, 3]) -// } else { -// // For higher order tensors, treat everything after dim0 as fields -// num_fields_ = 1; -// for (size_t i = 1; i < order_; i++) { -// num_fields_ *= dims_[i]; -// } -// } - -// // Validate dimensions match total items -// if (dims_[0] != num_total) { -// std::cerr << "Error: Array dim0 (" << dims_[0] << ") does not match num_total (" -// << num_total << ")" << std::endl; -// std::cerr << " Array must be allocated with size = num_owned + num_ghost" << std::endl; -// } -// } - - -// /** -// * @brief Synchronize ghost data using neighborhood collectives -// */ -// template -// inline void MPIDArrayKokkos::communicate() -// { -// if (!comm_plan_) { -// std::cerr << "Error: CommunicationPlan not set. Call set_communication_plan() first." << std::endl; -// return; -// } - -// if (!comm_plan_->has_graph_comm) { -// std::cerr << "Error: Graph communicator not initialized in CommunicationPlan." << std::endl; -// std::cerr << " Call comm_plan.create_graph_communicator() first." << std::endl; -// return; -// } - -// // 1. Update host from device (ensure data is current on CPU for MPI) -// this->update_host(); - -// // 2. Get raw pointer to data -// T* data_ptr = this->host_pointer(); - -// // 3. Convert to double* for MPI communication -// // TODO: Support other types (int, float, etc.) 
with template specialization -// static_assert(std::is_same::value, -// "Currently only double supported for ghost communication"); - -// double* double_ptr = reinterpret_cast(data_ptr); - -// // 4. Call neighborhood collective exchange -// comm_plan_->exchange_ghosts_neighborhood(double_ptr, static_cast(num_fields_)); - -// // 5. Update device with new ghost data -// this->update_device(); -// } - - -// /** -// * @brief Non-blocking version: start ghost exchange -// */ -// template -// inline void MPIDArrayKokkos::communicate_begin() -// { -// // TODO: Implement non-blocking version using Isend/Irecv -// // For now, just call blocking version -// std::cerr << "Warning: communicate_begin() not yet implemented, using blocking communicate()" << std::endl; -// communicate(); -// } - - -// /** -// * @brief Wait for non-blocking ghost exchange to complete -// */ -// template -// inline void MPIDArrayKokkos::communicate_wait() -// { -// // TODO: Implement non-blocking version -// // For now, this is a no-op since communicate_begin() is blocking -// } - - -// #endif // MPIDARRAYKOKKOS_H + +// Default constructor +template +MPICArrayKokkos::MPICArrayKokkos() + : this_array_() { } + +// Overloaded 1D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0); +} + +// Overloaded 2D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1); +} + +// Overloaded 3D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2); +} + +// Overloaded 4D constructor 
+template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3); +} + +// Overloaded 5D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4); +} + +// Overloaded 6D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4, dim5); +} + +// Overloaded 7D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); +} + + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i) const { + assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 1D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 1D!"); + return this_array_(i); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j) const { + assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 2D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 2D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 2D!"); + return this_array_(i, 
j); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k) const { + assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 3D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 3D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 3D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 3D!"); + return this_array_(i, j, k); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { + assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 4D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 4D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 4D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 4D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 4D!"); + return this_array_(i, j, k, l); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const { + assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 5D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 5D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 5D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 5D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 5D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 5D!"); + return this_array_(i, j, k, l, m); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) const { + assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 6D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 6D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 6D!"); + assert(k 
< dims_[2] && "k is out of bounds in MPICArrayKokkos 6D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 6D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 6D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 6D!"); + return this_array_(i, j, k, l, m, n); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n, size_t o) const { + assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 7D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 7D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 7D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 7D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 7D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 7D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 7D!"); + assert(o < dims_[6] && "o is out of bounds in MPICArrayKokkos 7D!"); + return this_array_(i, j, k, l, m, n, o); +} + +template +KOKKOS_INLINE_FUNCTION +MPIDArrayKokkos& MPICArrayKokkos::operator=(const MPIDArrayKokkos& temp) { + this_array_ = temp.this_array_; + return *this; +} + +// Return size +template +KOKKOS_INLINE_FUNCTION +size_t MPICArrayKokkos::size() const { + return this_array_.size(); +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPICArrayKokkos::extent() const { + return this_array_.extent(); +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPICArrayKokkos::dims(size_t i) const { + assert(i < order_ && "MPICArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); + assert(dims_[i]>0 && "Access to MPICArrayKokkos dims is out of bounds!"); + return this_array_.dims(i); +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPICArrayKokkos::order() const { + return this_array_.order(); +} + +template +KOKKOS_INLINE_FUNCTION +T* MPICArrayKokkos::device_pointer() const { + 
return this_array_.device_pointer(); +} + +template +KOKKOS_INLINE_FUNCTION +T* MPICArrayKokkos::host_pointer() const { + return this_array_.host_pointer(); +} + +template +KOKKOS_INLINE_FUNCTION +Kokkos::DualView MPICArrayKokkos::get_kokkos_dual_view() const { + return this_array_.get_kokkos_dual_view(); +} + +template +void MPICArrayKokkos::update_host() { + this_array_.update_host(); +} + +template +void MPICArrayKokkos::update_device() { + this_array_.update_device(); +} + +template +KOKKOS_INLINE_FUNCTION +MPICArrayKokkos::~MPICArrayKokkos() { + this_array_.~DCArrayKokkos(); +} + +#endif \ No newline at end of file From c4fac4d2bf3398ef6692250cd798a21a26a6e178 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 5 Nov 2025 17:46:09 -0600 Subject: [PATCH 20/52] ENH: Handling comms via data structure --- examples/mesh_decomp/communication_plan.h | 19 +- examples/mesh_decomp/decomp_utils.h | 364 ++++++++-------------- examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mpi_type.h | 201 +++++++++++- examples/mesh_decomp/state.h | 14 +- 5 files changed, 340 insertions(+), 260 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index b49befb8..16904e57 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -1,3 +1,10 @@ +#ifndef COMMUNICATION_PLAN_H +#define COMMUNICATION_PLAN_H + +#include "matar.h" + +using namespace mtr; + /** * @struct CommunicationPlan * @brief Manages efficient MPI communication for ghost element and node data exchange @@ -69,7 +76,7 @@ int total_recv_count; - + // ======================================================================== // CONSTRUCTOR / INITIALIZATION @@ -103,7 +110,7 @@ this->num_send_ranks = num_send_ranks; this->num_recv_ranks = num_recv_ranks; - + this->send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); for(int i = 0; i < num_send_ranks; i++){ this->send_rank_ids(i) = send_rank_ids[i]; 
@@ -114,7 +121,7 @@ for(int i = 0; i < num_recv_ranks; i++){ this->recv_rank_ids(i) = recv_rank_ids[i]; } - + MPI_Dist_graph_create_adjacent( mpi_comm_world, num_recv_ranks, @@ -234,8 +241,8 @@ void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ - this->send_indices_ = rank_send_ids; - this->recv_indices_ = rank_recv_ids; + this->send_indices_ = rank_send_ids; // ods of element data to send to each rank + this->recv_indices_ = rank_recv_ids; // // Setup send data @@ -300,3 +307,5 @@ } }; + +#endif // COMMUNICATION_PLAN_H \ No newline at end of file diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 2a256deb..49e1113e 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -1893,14 +1893,14 @@ void partition_mesh( final_node.coords.update_device(); -// -------------------------------------------------------------------------------------- + // -------------------------------------------------------------------------------------- // Build the send patterns for elements -// Build reverse map via global IDs: for each local element gid, find ranks that ghost it. -// Steps: -// 1) Each rank contributes its ghost element GIDs. -// 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. -// 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. -// -------------------------------------------------------------------------------------- + // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost element GIDs. + // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); // Prepare local ghost list as vector @@ -1960,7 +1960,7 @@ void partition_mesh( std::cout.flush(); MPI_Barrier(MPI_COMM_WORLD); - + // Optional: print a compact summary of reverse map for verification (limited output) for(int i = 0; i < world_size; i++) { @@ -2169,7 +2169,8 @@ void partition_mesh( for (size_t i = 0; i < ghost_elem_owner_ranks.size(); i++) { int source_rank = ghost_elem_owner_ranks[i]; - elems_to_recv_by_rank[source_rank].push_back(static_cast(i)); + int ghost_elem_local_id = final_mesh.num_owned_elems + i; + elems_to_recv_by_rank[source_rank].push_back(ghost_elem_local_id); } // ========== Serialize into a DRaggedRightArrayKokkos ========== @@ -2177,6 +2178,7 @@ void partition_mesh( for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { int source_rank = element_communication_plan.recv_rank_ids.host(i); elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); + } DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); // Fill in the data @@ -2188,6 +2190,21 @@ void partition_mesh( } elems_to_recv_by_rank_rr.update_device(); + // Debug: Print send vs recv counts per neighbor to diagnose mismatch + if (print_info) { + std::cout << "[rank " << rank << "] Send/Recv count comparison:" << std::endl; + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + int send_count = elems_to_send_by_rank_rr.stride_host(i); + std::cout << " To rank " << dest_rank << ": sending " << send_count << " elements" << std::endl; + } + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int src_rank = element_communication_plan.recv_rank_ids.host(i); + int recv_count = elems_to_recv_by_rank_rr.stride_host(i); + std::cout << " From rank " << 
src_rank << ": expecting " << recv_count << " elements" << std::endl; + } + } + element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); @@ -2200,13 +2217,12 @@ void partition_mesh( // Gauss points share the same communication plan as elements. // This test initializes gauss point fields on owned elements and exchanges them with ghost elements. - print_info = true; // Enable debug output for communication test - - gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}); // , &element_communication_plan + gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}, element_communication_plan); // , &element_communication_plan // Initialize the gauss point fields on each rank // Set owned elements to rank number, ghost elements to -1 (to verify communication) for (int i = 0; i < final_mesh.num_owned_elems; i++) { + // if(rank == 0) std::cout << " Setting owned element " << i << " to rank " << rank << std::endl; gauss_point.fields.host(i) = static_cast(rank); } for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { @@ -2214,136 +2230,8 @@ void partition_mesh( } gauss_point.fields.update_device(); - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to build the send buffer for element communication" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - - // ========== Build send buffer organized by destination rank ========== - std::vector elem_send_buffer(element_communication_plan.total_send_count); - int send_idx = 0; - - for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { - // Get the number of elements to send to this neighbor - size_t num_elems_to_send = elems_to_send_by_rank_rr.stride_host(i); - if(rank == 0) std::cout << " Sending " << num_elems_to_send << " elements to rank " << element_communication_plan.send_rank_ids.host(i) << std::endl; - - for (size_t j = 0; j < num_elems_to_send; j++) { - int elem_lid = 
elems_to_send_by_rank_rr.host(i, j); - if(rank == 0) std::cout << " Sending element " << elem_lid << std::endl; - double value = gauss_point.fields.host(elem_lid); - - if(rank == 0) std::cout << " Value: " << value << std::endl; - elem_send_buffer[send_idx++] = value; - } - } - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished building the send buffer for element communication" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - - // ========== Allocate receive buffer ========== - std::vector elem_recv_buffer(element_communication_plan.total_recv_count); - - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished building the receive buffer for element communication" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to exchange element data using MPI_Neighbor_alltoallv" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - - - - MPI_Neighbor_alltoallv( - elem_send_buffer.data(), // Send buffer with boundary element data - element_communication_plan.send_counts_.host_pointer(), // Number of elements to send to each outgoing neighbor [outdegree] - element_communication_plan.send_displs_.host_pointer(), // Displacement in send buffer for each outgoing neighbor [outdegree] - MPI_DOUBLE, // Send data type - elem_recv_buffer.data(), // Receive buffer for ghost element data - element_communication_plan.recv_counts_.host_pointer(), // Number of elements to receive from each incoming neighbor [indegree] - element_communication_plan.recv_displs_.host_pointer(), // Displacement in recv buffer for each incoming neighbor [indegree] - MPI_DOUBLE, // Receive data type - element_communication_plan.mpi_comm_graph // Distributed graph communicator - ); - - // ========== Update ghost element fields from receive buffer ========== - // Unpack received data back into ghost elements in the correct order - - // Track which ghost elements have been updated for debugging - std::vector 
ghost_updated(final_mesh.num_ghost_elems, false); - - int recv_idx = 0; - for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - // Get the number of ghost elements from this source rank - size_t num_ghosts_from_source = elems_to_recv_by_rank_rr.stride_host(i); - - for (size_t j = 0; j < num_ghosts_from_source; j++) { - int ghost_idx = elems_to_recv_by_rank_rr.host(i, j); - int ghost_elem_local_id = final_mesh.num_owned_elems + ghost_idx; - gauss_point.fields.host(ghost_elem_local_id) = elem_recv_buffer[recv_idx++]; - ghost_updated[ghost_idx] = true; - } - } + gauss_point.fields.communicate(); - // Debug: Check which ghosts weren't updated - if (print_info) { - std::vector missing_ghosts; - for (size_t i = 0; i < ghost_updated.size(); i++) { - if (!ghost_updated[i]) { - missing_ghosts.push_back(static_cast(i)); - } - } - - if (!missing_ghosts.empty()) { - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] WARNING: " << missing_ghosts.size() - << " ghost elements not in elems_to_recv_by_rank: "; - for (size_t i = 0; i < std::min(missing_ghosts.size(), size_t(10)); i++) { - std::cout << missing_ghosts[i] << " "; - } - if (missing_ghosts.size() > 10) std::cout << "..."; - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - } - } - - gauss_point.fields.update_device(); - - // ========== Verify the communication worked correctly ========== - bool comm_test_passed = true; - for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { - if (gauss_point.fields.host(i) < 0.0) { - std::cerr << "[rank " << rank << "] ERROR: Ghost element " << i - << " was not updated (value = " << gauss_point.fields.host(i) << ")" << std::endl; - comm_test_passed = false; - } - } - - int local_test_passed = comm_test_passed ? 
1 : 0; - int global_test_passed = 0; - MPI_Allreduce(&local_test_passed, &global_test_passed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) { - if (global_test_passed) { - std::cout << "\n✓ Element communication test PASSED on all ranks\n" << std::endl; - } else { - std::cout << "\n✗ Element communication test FAILED on one or more ranks\n" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - - print_info = false; // Disable debug output after communication test - // Loop over all elements and average the values of elements connected to that element for (int i = 0; i < final_mesh.num_elems; i++) { double value = 0.0; @@ -2355,112 +2243,112 @@ void partition_mesh( } gauss_point.fields.update_device(); - -// -------------------------------------------------------------------------------------- + + // -------------------------------------------------------------------------------------- // Build the send pattern for nodes -// Build reverse map via global IDs: for each local node gid, find ranks that ghost it. -// Steps: -// 1) Each rank contributes its ghost node GIDs. -// 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. -// 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
-// -------------------------------------------------------------------------------------- - - std::vector>> boundary_node_targets(intermediate_mesh.num_nodes); - - // Prepare local ghost node list as vector - std::vector ghost_node_gids_vec; - ghost_node_gids_vec.reserve(ghost_only_nodes.size()); - for (const auto &g : ghost_only_nodes) ghost_node_gids_vec.push_back(g); - - // Exchange counts - std::vector ghost_node_counts(world_size, 0); - int local_ghost_node_count = static_cast(ghost_node_gids_vec.size()); - MPI_Allgather(&local_ghost_node_count, 1, MPI_INT, ghost_node_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - // Displacements and recv buffer - std::vector ghost_node_displs(world_size, 0); - int total_ghost_nodes = 0; - for (int r = 0; r < world_size; ++r) { - ghost_node_displs[r] = total_ghost_nodes; - total_ghost_nodes += ghost_node_counts[r]; - } - std::vector all_ghost_node_gids(total_ghost_nodes); - - // Gather ghost node gids - MPI_Allgatherv(ghost_node_gids_vec.data(), local_ghost_node_count, MPI_UNSIGNED_LONG_LONG, - all_ghost_node_gids.data(), ghost_node_counts.data(), ghost_node_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished gathering ghost node GIDs" << std::endl; - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to build the reverse map for node communication" << std::endl; - - // Build map node_gid -> ranks that ghost it - std::unordered_map> node_gid_to_ghosting_ranks; - node_gid_to_ghosting_ranks.reserve(static_cast(total_ghost_nodes)); - for (int r = 0; r < world_size; ++r) { - int cnt = ghost_node_counts[r]; - int off = ghost_node_displs[r]; - for (int i = 0; i < cnt; ++i) { - size_t g = all_ghost_node_gids[off + i]; - node_gid_to_ghosting_ranks[g].push_back(r); - } - } + // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost node GIDs. 
+ // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. + // -------------------------------------------------------------------------------------- + + // std::vector>> boundary_node_targets(intermediate_mesh.num_nodes); + + // // Prepare local ghost node list as vector + // std::vector ghost_node_gids_vec; + // ghost_node_gids_vec.reserve(ghost_only_nodes.size()); + // for (const auto &g : ghost_only_nodes) ghost_node_gids_vec.push_back(g); + + // // Exchange counts + // std::vector ghost_node_counts(world_size, 0); + // int local_ghost_node_count = static_cast(ghost_node_gids_vec.size()); + // MPI_Allgather(&local_ghost_node_count, 1, MPI_INT, ghost_node_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // // Displacements and recv buffer + // std::vector ghost_node_displs(world_size, 0); + // int total_ghost_nodes = 0; + // for (int r = 0; r < world_size; ++r) { + // ghost_node_displs[r] = total_ghost_nodes; + // total_ghost_nodes += ghost_node_counts[r]; + // } + // std::vector all_ghost_node_gids(total_ghost_nodes); + + // // Gather ghost node gids + // MPI_Allgatherv(ghost_node_gids_vec.data(), local_ghost_node_count, MPI_UNSIGNED_LONG_LONG, + // all_ghost_node_gids.data(), ghost_node_counts.data(), ghost_node_displs.data(), + // MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // MPI_Barrier(MPI_COMM_WORLD); + // if(rank == 0) std::cout << " Finished gathering ghost node GIDs" << std::endl; + + + // MPI_Barrier(MPI_COMM_WORLD); + // if(rank == 0) std::cout << " Starting to build the reverse map for node communication" << std::endl; + + // // Build map node_gid -> ranks that ghost it + // std::unordered_map> node_gid_to_ghosting_ranks; + // node_gid_to_ghosting_ranks.reserve(static_cast(total_ghost_nodes)); + // for (int r = 0; r < world_size; ++r) { + // int cnt = ghost_node_counts[r]; + // int off = ghost_node_displs[r]; + // for (int i = 0; i < cnt; ++i) { + 
// size_t g = all_ghost_node_gids[off + i]; + // node_gid_to_ghosting_ranks[g].push_back(r); + // } + // } - // For each local node, list destinations: ranks that ghost our node gid - for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { - size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - auto it = node_gid_to_ghosting_ranks.find(local_node_gid); - if (it == node_gid_to_ghosting_ranks.end()) continue; - const std::vector &dest_ranks = it->second; - for (int rr : dest_ranks) { - if (rr == rank) continue; - boundary_node_targets[node_lid].push_back(std::make_pair(rr, local_node_gid)); - } - } + // // For each local node, list destinations: ranks that ghost our node gid + // for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { + // size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + // auto it = node_gid_to_ghosting_ranks.find(local_node_gid); + // if (it == node_gid_to_ghosting_ranks.end()) continue; + // const std::vector &dest_ranks = it->second; + // for (int rr : dest_ranks) { + // if (rr == rank) continue; + // boundary_node_targets[node_lid].push_back(std::make_pair(rr, local_node_gid)); + // } + // } - std::cout.flush(); - MPI_Barrier(MPI_COMM_WORLD); - print_info = false; + // std::cout.flush(); + // MPI_Barrier(MPI_COMM_WORLD); + // print_info = false; - // Optional: print a compact summary of node reverse map for verification (limited output) - for(int i = 0; i < world_size; i++) { - if (rank == i && print_info) { - std::cout << std::endl; - for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { + // // Optional: print a compact summary of node reverse map for verification (limited output) + // for(int i = 0; i < world_size; i++) { + // if (rank == i && print_info) { + // std::cout << std::endl; + // for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { - size_t local_node_gid = 
intermediate_mesh.local_to_global_node_mapping.host(node_lid); - if (boundary_node_targets[node_lid].empty()) - { - std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: no ghost nodes" << std::endl; - } - else - { - std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: "; - int shown = 0; - for (const auto &pr : boundary_node_targets[node_lid]) { - if (shown >= 12) { std::cout << " ..."; break; } - std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; - shown++; - } - std::cout << std::endl; - } - } - std::cout.flush(); - } - MPI_Barrier(MPI_COMM_WORLD); - } + // size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + // if (boundary_node_targets[node_lid].empty()) + // { + // std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: no ghost nodes" << std::endl; + // } + // else + // { + // std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: "; + // int shown = 0; + // for (const auto &pr : boundary_node_targets[node_lid]) { + // if (shown >= 12) { std::cout << " ..."; break; } + // std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; + // shown++; + // } + // std::cout << std::endl; + // } + // } + // std::cout.flush(); + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } - print_info = false; + // print_info = false; - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; + // MPI_Barrier(MPI_COMM_WORLD); + // if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; + - } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index e1383ccb..b14ee9cd 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ 
b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {20, 20, 20}; + int num_elems_dim[3] = {50, 50, 50}; // Initial mesh built on rank zero Mesh_t initial_mesh; diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 636a1bd0..54766d31 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -15,6 +15,9 @@ class MPICArrayKokkos { // Dual view for managing data on both CPU and GPU DCArrayKokkos this_array_; + + DCArrayKokkos send_buffer_; + DCArrayKokkos recv_buffer_; protected: size_t dims_[7]; @@ -97,12 +100,16 @@ class MPICArrayKokkos { size_t n, size_t o) const; KOKKOS_INLINE_FUNCTION - MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); + MPICArrayKokkos& operator=(const MPICArrayKokkos& temp); // Method to set comm plan KOKKOS_INLINE_FUNCTION - void initialize_comm_plan(CommunicationPlan* comm_plan); + void initialize_comm_plan(CommunicationPlan& comm_plan){ + comm_plan_ = &comm_plan; + send_buffer_ = DCArrayKokkos(comm_plan_->total_send_count, "send_buffer"); + recv_buffer_ = DCArrayKokkos(comm_plan_->total_recv_count, "recv_buffer"); + }; @@ -132,7 +139,7 @@ class MPICArrayKokkos { // Method returns kokkos dual view KOKKOS_INLINE_FUNCTION - TArray1D get_kokkos_dual_view() const; + Kokkos::DualView get_kokkos_dual_view() const; // Method that update host view void update_host(); @@ -140,11 +147,174 @@ class MPICArrayKokkos { // Method that update device view void update_device(); + // Method that builds the send buffer + void fill_send_buffer(){ + + int rank; + MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); + + // this_array_.update_host(); + int send_idx = 0; + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ + int src_idx = comm_plan_->send_indices_.host(i, j); + 
send_buffer_.host(send_idx) = this_array_.host(src_idx); + if(rank == 0) std::cout << "MPICArrayKokkos::fill_send_buffer() - send_buffer(" << send_idx << ") = " << this_array_.host(src_idx) << std::endl; + send_idx++; + } + } + }; + + // Method that copies the recv buffer + void copy_recv_buffer(){ + int rank; + MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); + + // NOTE: Do NOT call recv_buffer_.update_host() here! + // MPI already wrote directly to host memory, so calling update_host() + // would overwrite the received data by copying stale device data + int recv_idx = 0; + for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ + for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ + int dest_idx = comm_plan_->recv_indices_.host(i, j); + this_array_.host(dest_idx) = recv_buffer_.host(recv_idx); + //if(rank == 0) std::cout << "MPICArrayKokkos::copy_recv_buffer() - this_array(" << dest_idx << ") = " << recv_buffer_.host(recv_idx) << std::endl; + recv_idx++; + } + } + }; + + void communicate(){ + int rank; + MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); + + if(rank == 0) { + std::cout << "MPICArrayKokkos::communicate() - this_array size: " << this_array_.size() << std::endl; + std::cout << "MPICArrayKokkos::communicate() - send_buffer size: " << send_buffer_.size() + << ", recv_buffer size: " << recv_buffer_.size() << std::endl; + std::cout << "MPICArrayKokkos::communicate() - total_send_count: " << comm_plan_->total_send_count + << ", total_recv_count: " << comm_plan_->total_recv_count << std::endl; + } + + fill_send_buffer(); + + if(rank == 0) std::cout << "MPICArrayKokkos::communicate() - Starting MPI_Neighbor_alltoallv" << std::endl; + + + MPI_Barrier(comm_plan_->mpi_comm_world); + + // Verify buffer sizes match expected + if(rank == 0) { + std::cout << "Send buffer size check: " << send_buffer_.size() << " vs expected " << comm_plan_->total_send_count << std::endl; + std::cout << "Recv buffer size check: " << recv_buffer_.size() << " vs expected " << 
comm_plan_->total_recv_count << std::endl; + + // Print first few send values + std::cout << "MPICArrayKokkos::communicate() - send_buffer values: "; + for(int i = 0; i < 10 && i < send_buffer_.size(); i++) { + std::cout << send_buffer_.host(i) << " "; + } + std::cout << std::endl; + + // Print send counts and displs + std::cout << "Send counts: "; + int total_send = 0; + for(int i = 0; i < comm_plan_->num_send_ranks; i++) { + int count = comm_plan_->send_counts_.host(i); + std::cout << count << " "; + total_send += count; + } + std::cout << "(total=" << total_send << ")" << std::endl; + + std::cout << "Send displs: "; + for(int i = 0; i < comm_plan_->num_send_ranks; i++) { + std::cout << comm_plan_->send_displs_.host(i) << " "; + } + std::cout << std::endl; + + // Print recv counts and displs + std::cout << "Recv counts: "; + int total_recv = 0; + for(int i = 0; i < comm_plan_->num_recv_ranks; i++) { + int count = comm_plan_->recv_counts_.host(i); + std::cout << count << " "; + total_recv += count; + } + std::cout << "(total=" << total_recv << ")" << std::endl; + + std::cout << "Recv displs: "; + for(int i = 0; i < comm_plan_->num_recv_ranks; i++) { + std::cout << comm_plan_->recv_displs_.host(i) << " "; + } + std::cout << std::endl; + } + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << "MPICArrayKokkos::communicate() calling MPI_Neighbor_alltoallv"< 0) ? &send_buffer_.host(0) : nullptr; + T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; + int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; + int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; + int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; + int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; + + if(rank == 0) { + std::cout << "Pointer addresses:" << std::endl; + std::cout << " send_buf_ptr = " << (void*)send_buf_ptr << std::endl; + std::cout << " send_cnt_ptr = " << (void*)send_cnt_ptr << std::endl; + std::cout << " send_dsp_ptr = " << (void*)send_dsp_ptr << std::endl; + std::cout << " recv_buf_ptr = " << (void*)recv_buf_ptr << std::endl; + std::cout << " recv_cnt_ptr = " << (void*)recv_cnt_ptr << std::endl; + std::cout << " recv_dsp_ptr = " << (void*)recv_dsp_ptr << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Neighbor_alltoallv( + &send_buffer_.host(0), + &comm_plan_->send_counts_.host(0), + &comm_plan_->send_displs_.host(0), + MPI_DOUBLE, + &recv_buffer_.host(0), + &comm_plan_->recv_counts_.host(0), + &comm_plan_->recv_displs_.host(0), + MPI_DOUBLE, + comm_plan_->mpi_comm_graph); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << "MPICArrayKokkos::communicate() finished MPI_Neighbor_alltoallv"<::MPICArrayKokkos() template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0); + host = ViewCArray (this_array_.host_pointer(), dim0); } // Overloaded 2D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1); } // Overloaded 3D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2); } // Overloaded 4D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t 
dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); } // Overloaded 5D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); } // Overloaded 6D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4, dim5); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); } // Overloaded 7D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); } @@ -283,8 +453,12 @@ T& MPICArrayKokkos::operator()(size_t i, size_t template KOKKOS_INLINE_FUNCTION -MPIDArrayKokkos& MPICArrayKokkos::operator=(const MPIDArrayKokkos& temp) { +MPICArrayKokkos& MPICArrayKokkos::operator=(const MPICArrayKokkos& temp) { this_array_ = temp.this_array_; + host = temp.host; // Also copy the host ViewCArray + comm_plan_ = temp.comm_plan_; + send_buffer_ = 
temp.send_buffer_; + recv_buffer_ = temp.recv_buffer_; return *this; } @@ -346,7 +520,8 @@ void MPICArrayKokkos::update_device() { template KOKKOS_INLINE_FUNCTION MPICArrayKokkos::~MPICArrayKokkos() { - this_array_.~DCArrayKokkos(); + // Member variables (this_array_, send_buffer_, recv_buffer_) are automatically + // destroyed by the compiler - no explicit cleanup needed } #endif \ No newline at end of file diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 01f54624..b1ad58a4 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -35,6 +35,7 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STATE_H #include "matar.h" +#include "mpi_type.h" using namespace mtr; @@ -95,17 +96,24 @@ enum class gauss_pt_state struct GaussPoint_t { - DCArrayKokkos fields; ///< GaussPoint fields + //DCArrayKokkos fields; ///< GaussPoint fields + + + MPICArrayKokkos fields; // initialization method (num_cells, num_dims) - void initialize(size_t num_gauss_pnts, size_t num_dims, std::vector gauss_pt_states) + void initialize(size_t num_gauss_pnts, size_t num_dims, std::vector gauss_pt_states, CommunicationPlan& comm_plan) { for (auto field : gauss_pt_states){ switch(field){ case gauss_pt_state::fields: - if (fields.size() == 0) this->fields = DCArrayKokkos(num_gauss_pnts, "gauss_point_fields"); + //if (fields.size() == 0) this->fields = DCArrayKokkos(num_gauss_pnts, "gauss_point_fields"); + if (fields.size() == 0){ + this->fields = MPICArrayKokkos(num_gauss_pnts, "gauss_point_fields"); + this->fields.initialize_comm_plan(comm_plan); + } break; default: std::cout<<"Desired gauss point state not understood in GaussPoint_t initialize"< Date: Thu, 6 Nov 2025 15:56:37 -0600 Subject: [PATCH 21/52] ENH: Testing multi-dimensional MPICArrayKokkos types, working --- examples/mesh_decomp/decomp_utils.h | 54 +++-- examples/mesh_decomp/mesh_io.h | 32 ++- examples/mesh_decomp/mpi_type.h | 321 ++++++++++++++-------------- 
examples/mesh_decomp/state.h | 12 +- 4 files changed, 244 insertions(+), 175 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 49e1113e..e3421259 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -1812,9 +1812,9 @@ void partition_mesh( } // 2. Build owned node GIDs and their coordinates - std::vector owned_gids(intermediate_mesh.num_nodes); - for (int i = 0; i < owned_gids.size(); ++i) - owned_gids[i] = intermediate_mesh.local_to_global_node_mapping.host(i); + std::vector owned_gids(final_mesh.num_owned_nodes); + for (int i = 0; i < final_mesh.num_owned_nodes; ++i) + owned_gids[i] = final_mesh.local_to_global_node_mapping.host(i); // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) // so we can distribute the needed coordinate data. @@ -1901,16 +1901,18 @@ void partition_mesh( // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
// -------------------------------------------------------------------------------------- - std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); + std::vector>> boundary_elem_targets(final_mesh.num_owned_elems); // Prepare local ghost list as vector std::vector ghost_gids_vec; - ghost_gids_vec.reserve(ghost_elem_gids.size()); - for (const auto &g : ghost_elem_gids) ghost_gids_vec.push_back(g); + ghost_gids_vec.reserve(final_mesh.num_ghost_elems); + for (int i = 0; i < final_mesh.num_ghost_elems; ++i) { + ghost_gids_vec.push_back(final_mesh.local_to_global_elem_mapping.host(final_mesh.num_owned_elems + i)); // Ghost elements are after the owned elements in the global element mapping + } // Exchange counts std::vector ghost_counts(world_size, 0); - int local_ghost_count = static_cast(ghost_gids_vec.size()); + int local_ghost_count = final_mesh.num_ghost_elems; MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); // Displacements and recv buffer @@ -1947,8 +1949,8 @@ void partition_mesh( } // For each local element, list destinations: ranks that ghost our gid - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { - size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { + size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); auto it = gid_to_ghosting_ranks.find(local_elem_gid); if (it == gid_to_ghosting_ranks.end()) continue; const std::vector &dest_ranks = it->second; @@ -1966,9 +1968,9 @@ void partition_mesh( for(int i = 0; i < world_size; i++) { if (rank == i && print_info) { std::cout << std::endl; - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { - size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + size_t 
local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); if (boundary_elem_targets[elem_lid].empty()) { std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost elements" << std::endl; @@ -1997,9 +1999,9 @@ void partition_mesh( std::set ghost_comm_ranks; // set of ranks that this rank communicates with - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { - int local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + int local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); if (boundary_elem_targets[elem_lid].empty()) { continue; @@ -2217,20 +2219,30 @@ void partition_mesh( // Gauss points share the same communication plan as elements. // This test initializes gauss point fields on owned elements and exchanges them with ghost elements. - gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}, element_communication_plan); // , &element_communication_plan + std::vector gauss_pt_states = {gauss_pt_state::fields, gauss_pt_state::fields_vec}; + + gauss_point.initialize(final_mesh.num_elems, final_mesh.num_dims, gauss_pt_states, element_communication_plan); // , &element_communication_plan // Initialize the gauss point fields on each rank // Set owned elements to rank number, ghost elements to -1 (to verify communication) for (int i = 0; i < final_mesh.num_owned_elems; i++) { // if(rank == 0) std::cout << " Setting owned element " << i << " to rank " << rank << std::endl; gauss_point.fields.host(i) = static_cast(rank); + gauss_point.fields_vec.host(i, 0) = static_cast(rank); + gauss_point.fields_vec.host(i, 1) = static_cast(rank); + gauss_point.fields_vec.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { gauss_point.fields.host(i) = -1.0; // Ghost elements should be 
updated + gauss_point.fields_vec.host(i, 0) = -1.0; + gauss_point.fields_vec.host(i, 1) = -1.0; + gauss_point.fields_vec.host(i, 2) = -1.0; } gauss_point.fields.update_device(); - + gauss_point.fields_vec.update_device(); + gauss_point.fields.communicate(); + gauss_point.fields_vec.communicate(); // Loop over all elements and average the values of elements connected to that element for (int i = 0; i < final_mesh.num_elems; i++) { @@ -2241,7 +2253,17 @@ void partition_mesh( value /= final_mesh.num_elems_in_elem(i); gauss_point.fields.host(i) = value; } - gauss_point.fields.update_device(); + for (int i = 0; i < final_mesh.num_elems; i++) { + double value = 0.0; + for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { + value += gauss_point.fields_vec.host(final_mesh.elems_in_elem(i, j), 0); + } + value /= final_mesh.num_elems_in_elem(i); + gauss_point.fields_vec.host(i, 0) = value; + gauss_point.fields_vec.host(i, 1) = value; + gauss_point.fields_vec.host(i, 2) = value; + } + gauss_point.fields_vec.update_device(); diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 77dac8d0..8170e531 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -513,7 +513,7 @@ void write_vtu(Mesh_t& mesh, Kokkos::fence(); const int num_cell_scalar_vars = 4; - const int num_cell_vec_vars = 0; + const int num_cell_vec_vars = 1; const int num_cell_tensor_vars = 0; const int num_point_scalar_vars = 3; @@ -524,6 +524,10 @@ void write_vtu(Mesh_t& mesh, "rank_id", "elems_in_elem_owned", "global_elem_id", "field_value" }; + const char cell_vec_var_names[num_cell_vec_vars][15] = { + "field_vec" + }; + const char point_scalar_var_names[num_point_scalar_vars][15] = { "rank_id", "elems_in_node", "global_node_id" }; @@ -539,12 +543,16 @@ void write_vtu(Mesh_t& mesh, // save the cell state to an array for exporting to graphics files auto elem_fields = CArray(num_elems, num_cell_scalar_vars); - + auto elem_vec_fields = CArray(num_elems, 
num_cell_vec_vars, 3); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { elem_fields(elem_gid, 0) = rank; elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); elem_fields(elem_gid, 3) = gauss_point.fields.host(elem_gid); + elem_vec_fields(elem_gid, 0, 0) = gauss_point.fields_vec.host(elem_gid, 0); + elem_vec_fields(elem_gid, 0, 1) = gauss_point.fields_vec.host(elem_gid, 1); + elem_vec_fields(elem_gid, 0, 2) = gauss_point.fields_vec.host(elem_gid, 2); } // save the vertex vector fields to an array for exporting to graphics files @@ -670,6 +678,22 @@ void write_vtu(Mesh_t& mesh, // Write CellData (element fields) fprintf(vtu_file, " \n"); + + // Cell vector variables + for (int var = 0; var < num_cell_vec_vars; var++) { + fprintf(vtu_file, " \n", + cell_vec_var_names[var]); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + // TODO: Populate cell vector field data from appropriate source + fprintf(vtu_file, " %f %f %f\n", + gauss_point.fields_vec.host(elem_gid, 0), + gauss_point.fields_vec.host(elem_gid, 1), + gauss_point.fields_vec.host(elem_gid, 2)); + } + fprintf(vtu_file, " \n"); + } + + // Cell scalar variables for (int var = 0; var < num_cell_scalar_vars; var++) { fprintf(vtu_file, " \n", cell_scalar_var_names[var]); @@ -730,6 +754,10 @@ void write_vtu(Mesh_t& mesh, // Write PCellData fprintf(pvtu_file, " \n"); + for (int var = 0; var < num_cell_vec_vars; var++) { + fprintf(pvtu_file, " \n", + cell_vec_var_names[var]); + } for (int var = 0; var < num_cell_scalar_vars; var++) { fprintf(pvtu_file, " \n", cell_scalar_var_names[var]); diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 54766d31..858705d7 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -6,6 +6,69 @@ using namespace mtr; +// Add this before the MPICArrayKokkos class definition + +// Type trait to map C++ types 
to MPI_Datatype +template +struct mpi_type_map { + static MPI_Datatype value() { + static_assert(sizeof(T) == 0, "Unsupported type for MPI communication"); + return MPI_DATATYPE_NULL; + } +}; + +// Specializations for common types +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_INT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_FLOAT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_DOUBLE; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_CHAR; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_CHAR; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_C_BOOL; } +}; + + ///////////////////////// // MPICArrayKokkos: Dual type for managing distributed data on both CPU and GPU. 
// @@ -39,6 +102,8 @@ class MPICArrayKokkos { DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank + size_t stride_; // [size: num_dims] Number of contiguous values per first index element + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_by_rank] Indices of items to send to each rank DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank @@ -46,8 +111,6 @@ class MPICArrayKokkos { size_t num_owned_; // Number of owned items (nodes/elements) size_t num_ghost_; // Number of ghost items (nodes/elements) - - void set_mpi_type(); public: // Data member to access host view @@ -104,13 +167,43 @@ class MPICArrayKokkos { // Method to set comm plan - KOKKOS_INLINE_FUNCTION void initialize_comm_plan(CommunicationPlan& comm_plan){ comm_plan_ = &comm_plan; - send_buffer_ = DCArrayKokkos(comm_plan_->total_send_count, "send_buffer"); - recv_buffer_ = DCArrayKokkos(comm_plan_->total_recv_count, "recv_buffer"); - }; + + size_t send_size = comm_plan_->total_send_count * stride_; + size_t recv_size = comm_plan_->total_recv_count * stride_; + + if (send_size > 0) { + send_buffer_ = DCArrayKokkos(send_size, "send_buffer"); + } + if (recv_size > 0) { + recv_buffer_ = DCArrayKokkos(recv_size, "recv_buffer"); + } + if (comm_plan_->num_send_ranks > 0) { + send_counts_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_counts"); + send_displs_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_displs"); + + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + send_counts_.host(i) = comm_plan_->send_counts_.host(i) * stride_; + send_displs_.host(i) = comm_plan_->send_displs_.host(i) * stride_; + } + send_counts_.update_device(); + send_displs_.update_device(); + } + + if (comm_plan_->num_recv_ranks > 0) { + recv_counts_ = 
DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_counts"); + recv_displs_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_displs"); + + for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ + recv_counts_.host(i) = comm_plan_->recv_counts_.host(i) * stride_; + recv_displs_.host(i) = comm_plan_->recv_displs_.host(i) * stride_; + } + recv_counts_.update_device(); + recv_displs_.update_device(); + } + }; // GPU Method @@ -147,228 +240,145 @@ class MPICArrayKokkos { // Method that update device view void update_device(); - // Method that builds the send buffer + // Method that builds the send buffer, note, this has to be ordered + // Such that all the boundary elements going to a given rank are contiguous in the send buffer. void fill_send_buffer(){ - int rank; - MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); - // this_array_.update_host(); - int send_idx = 0; + + T* src_ptr = this_array_.host_pointer(); + + + size_t send_idx = 0; for(int i = 0; i < comm_plan_->num_send_ranks; i++){ for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ - int src_idx = comm_plan_->send_indices_.host(i, j); - send_buffer_.host(send_idx) = this_array_.host(src_idx); - if(rank == 0) std::cout << "MPICArrayKokkos::fill_send_buffer() - send_buffer(" << send_idx << ") = " << this_array_.host(src_idx) << std::endl; - send_idx++; + size_t src_idx = comm_plan_->send_indices_.host(i, j); // index of the element to send + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + send_buffer_.host(send_idx + k) = src_ptr[src_idx * stride_ + k]; + } + send_idx += stride_; } } }; - // Method that copies the recv buffer + // Method that copies the recv buffer into the this_array void copy_recv_buffer(){ - int rank; - MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); - - // NOTE: Do NOT call recv_buffer_.update_host() here! 
- // MPI already wrote directly to host memory, so calling update_host() - // would overwrite the received data by copying stale device data - int recv_idx = 0; + + T* dest_ptr = this_array_.host_pointer(); + + size_t recv_idx = 0; for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ - int dest_idx = comm_plan_->recv_indices_.host(i, j); - this_array_.host(dest_idx) = recv_buffer_.host(recv_idx); - //if(rank == 0) std::cout << "MPICArrayKokkos::copy_recv_buffer() - this_array(" << dest_idx << ") = " << recv_buffer_.host(recv_idx) << std::endl; - recv_idx++; + size_t dest_idx = comm_plan_->recv_indices_.host(i, j); + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + dest_ptr[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); + } + + recv_idx += stride_; } } + this_array_.update_device(); }; - void communicate(){ - int rank; - MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); - - if(rank == 0) { - std::cout << "MPICArrayKokkos::communicate() - this_array size: " << this_array_.size() << std::endl; - std::cout << "MPICArrayKokkos::communicate() - send_buffer size: " << send_buffer_.size() - << ", recv_buffer size: " << recv_buffer_.size() << std::endl; - std::cout << "MPICArrayKokkos::communicate() - total_send_count: " << comm_plan_->total_send_count - << ", total_recv_count: " << comm_plan_->total_recv_count << std::endl; - } - - fill_send_buffer(); - - if(rank == 0) std::cout << "MPICArrayKokkos::communicate() - Starting MPI_Neighbor_alltoallv" << std::endl; - - MPI_Barrier(comm_plan_->mpi_comm_world); - - // Verify buffer sizes match expected - if(rank == 0) { - std::cout << "Send buffer size check: " << send_buffer_.size() << " vs expected " << comm_plan_->total_send_count << std::endl; - std::cout << "Recv buffer size check: " << recv_buffer_.size() << " vs expected " << comm_plan_->total_recv_count << std::endl; - - // 
Print first few send values - std::cout << "MPICArrayKokkos::communicate() - send_buffer values: "; - for(int i = 0; i < 10 && i < send_buffer_.size(); i++) { - std::cout << send_buffer_.host(i) << " "; - } - std::cout << std::endl; - - // Print send counts and displs - std::cout << "Send counts: "; - int total_send = 0; - for(int i = 0; i < comm_plan_->num_send_ranks; i++) { - int count = comm_plan_->send_counts_.host(i); - std::cout << count << " "; - total_send += count; - } - std::cout << "(total=" << total_send << ")" << std::endl; - - std::cout << "Send displs: "; - for(int i = 0; i < comm_plan_->num_send_ranks; i++) { - std::cout << comm_plan_->send_displs_.host(i) << " "; - } - std::cout << std::endl; - - // Print recv counts and displs - std::cout << "Recv counts: "; - int total_recv = 0; - for(int i = 0; i < comm_plan_->num_recv_ranks; i++) { - int count = comm_plan_->recv_counts_.host(i); - std::cout << count << " "; - total_recv += count; - } - std::cout << "(total=" << total_recv << ")" << std::endl; - - std::cout << "Recv displs: "; - for(int i = 0; i < comm_plan_->num_recv_ranks; i++) { - std::cout << comm_plan_->recv_displs_.host(i) << " "; - } - std::cout << std::endl; - } - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << "MPICArrayKokkos::communicate() calling MPI_Neighbor_alltoallv"< 0) ? &send_buffer_.host(0) : nullptr; - T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; - int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; - int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; - int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; - int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; - - if(rank == 0) { - std::cout << "Pointer addresses:" << std::endl; - std::cout << " send_buf_ptr = " << (void*)send_buf_ptr << std::endl; - std::cout << " send_cnt_ptr = " << (void*)send_cnt_ptr << std::endl; - std::cout << " send_dsp_ptr = " << (void*)send_dsp_ptr << std::endl; - std::cout << " recv_buf_ptr = " << (void*)recv_buf_ptr << std::endl; - std::cout << " recv_cnt_ptr = " << (void*)recv_cnt_ptr << std::endl; - std::cout << " recv_dsp_ptr = " << (void*)recv_dsp_ptr << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); + // T* send_buf_ptr = (send_buffer_.size() > 0) ? &send_buffer_.host(0) : nullptr; + // T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; + // int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; + // int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; + // int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; + // int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; + + // Method that communicates the data between the ranks + void communicate(){ + + this_array_.update_host(); + + fill_send_buffer(); MPI_Neighbor_alltoallv( - &send_buffer_.host(0), - &comm_plan_->send_counts_.host(0), - &comm_plan_->send_displs_.host(0), - MPI_DOUBLE, - &recv_buffer_.host(0), - &comm_plan_->recv_counts_.host(0), - &comm_plan_->recv_displs_.host(0), - MPI_DOUBLE, + send_buffer_.host_pointer(), + send_counts_.host_pointer(), + send_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE + recv_buffer_.host_pointer(), + recv_counts_.host_pointer(), + recv_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE comm_plan_->mpi_comm_graph); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << "MPICArrayKokkos::communicate() finished MPI_Neighbor_alltoallv"< MPICArrayKokkos::MPICArrayKokkos() - : this_array_() { } + : this_array_(), stride_(1) { } // Overloaded 1D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) + : stride_(1) { this_array_ = DCArrayKokkos(dim0, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0); } // Overloaded 2D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) + : stride_(dim1) { this_array_ = DCArrayKokkos(dim0, dim1, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1); } // Overloaded 3D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) + : stride_(dim1 * dim2) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, 
dim1, dim2); } // Overloaded 4D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); } // Overloaded 5D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); } // Overloaded 6D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); } // Overloaded 7D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); host = ViewCArray 
(this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); } @@ -459,6 +469,7 @@ MPICArrayKokkos& MPICArrayKokkos fields; - + + MPICArrayKokkos fields_vec; // initialization method (num_cells, num_dims) void initialize(size_t num_gauss_pnts, size_t num_dims, std::vector gauss_pt_states, CommunicationPlan& comm_plan) @@ -115,6 +117,12 @@ struct GaussPoint_t this->fields.initialize_comm_plan(comm_plan); } break; + case gauss_pt_state::fields_vec: + if (fields_vec.size() == 0){ + this->fields_vec = MPICArrayKokkos(num_gauss_pnts, num_dims, "gauss_point_fields_vec"); + this->fields_vec.initialize_comm_plan(comm_plan); + } + break; default: std::cout<<"Desired gauss point state not understood in GaussPoint_t initialize"< Date: Thu, 6 Nov 2025 16:29:24 -0600 Subject: [PATCH 22/52] STYLE: Renaming a thing and headed home --- examples/mesh_decomp/mpi_type.h | 52 ++++++++++++++++++++++++--------- examples/mesh_decomp/state.h | 6 +--- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 858705d7..c49977c0 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -166,8 +166,8 @@ class MPICArrayKokkos { MPICArrayKokkos& operator=(const MPICArrayKokkos& temp); - // Method to set comm plan - void initialize_comm_plan(CommunicationPlan& comm_plan){ + // Method to set comm plan for halo communication + void initialize_mesh_comm_plan(CommunicationPlan& comm_plan){ comm_plan_ = &comm_plan; size_t send_size = comm_plan_->total_send_count * stride_; @@ -244,11 +244,6 @@ class MPICArrayKokkos { // Such that all the boundary elements going to a given rank are contiguous in the send buffer. 
void fill_send_buffer(){ - - - T* src_ptr = this_array_.host_pointer(); - - size_t send_idx = 0; for(int i = 0; i < comm_plan_->num_send_ranks; i++){ for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ @@ -256,7 +251,7 @@ class MPICArrayKokkos { // Copy all values associated with this element (handles multi-dimensional arrays) for(size_t k = 0; k < stride_; k++){ - send_buffer_.host(send_idx + k) = src_ptr[src_idx * stride_ + k]; + send_buffer_.host(send_idx + k) = this_array_.host_pointer()[src_idx * stride_ + k]; } send_idx += stride_; } @@ -265,9 +260,7 @@ class MPICArrayKokkos { // Method that copies the recv buffer into the this_array void copy_recv_buffer(){ - - T* dest_ptr = this_array_.host_pointer(); - + size_t recv_idx = 0; for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ @@ -275,7 +268,7 @@ class MPICArrayKokkos { // Copy all values associated with this element (handles multi-dimensional arrays) for(size_t k = 0; k < stride_; k++){ - dest_ptr[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); + this_array_.host_pointer()[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); } recv_idx += stride_; @@ -317,6 +310,38 @@ class MPICArrayKokkos { this_array_.update_device(); }; + + + // MPI send wrapper + void send(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void recv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI broadcast wrapper + void broadcast(size_t count, int root, MPI_Comm comm); + + // MPI scatter wrapper + void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + + // MPI gather wrapper + void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + + // MPI allgather wrapper + void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); + + // MPI send wrapper + void isend(size_t count, int 
dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void irecv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI wait wrapper for sender + void wait_send(); + + // MPI wait wrapper for receiver + void wait_recv(); + // Deconstructor virtual KOKKOS_INLINE_FUNCTION ~MPICArrayKokkos (); @@ -531,8 +556,7 @@ void MPICArrayKokkos::update_device() { template KOKKOS_INLINE_FUNCTION MPICArrayKokkos::~MPICArrayKokkos() { - // Member variables (this_array_, send_buffer_, recv_buffer_) are automatically - // destroyed by the compiler - no explicit cleanup needed + } #endif \ No newline at end of file diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 556039da..385723ed 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -97,11 +97,7 @@ enum class gauss_pt_state struct GaussPoint_t { - //DCArrayKokkos fields; ///< GaussPoint fields - - MPICArrayKokkos fields; - MPICArrayKokkos fields_vec; // initialization method (num_cells, num_dims) @@ -120,7 +116,7 @@ struct GaussPoint_t case gauss_pt_state::fields_vec: if (fields_vec.size() == 0){ this->fields_vec = MPICArrayKokkos(num_gauss_pnts, num_dims, "gauss_point_fields_vec"); - this->fields_vec.initialize_comm_plan(comm_plan); + this->fields_vec.initialize_mesh_comm_plan(comm_plan); } break; default: From 4447a5ccc614eece4e23610bfa2f2941ec5760fc Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 7 Nov 2025 14:09:04 -0600 Subject: [PATCH 23/52] ENH: Tidying up --- examples/mesh_decomp/communication_plan.h | 3 - examples/mesh_decomp/decomp_utils.h | 259 +++++++--------------- examples/mesh_decomp/mpi_type.h | 105 ++++++--- examples/mesh_decomp/state.h | 2 +- 4 files changed, 153 insertions(+), 216 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index 16904e57..63391262 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -59,9 +59,6 @@ using 
namespace mtr; int reorder = 0; - - - DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_per_rank] Indices of items to send to each rank DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_per_rank] Indices of items to receive from each rank diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index e3421259..3c2cefc4 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -22,30 +22,19 @@ #include "ptscotch.h" - - - - -void partition_mesh( +void naive_partition_mesh( Mesh_t& initial_mesh, - Mesh_t& final_mesh, node_t& initial_node, - node_t& final_node, - GaussPoint_t& gauss_point, + Mesh_t& naive_mesh, + node_t& naive_node, + std::vector& elems_in_elem_on_rank, + std::vector& num_elems_in_elem_per_rank, int world_size, - int rank){ + int rank) +{ - bool print_info = false; - bool print_vtk = false; - // Create mesh, gauss points, and node data structures on each rank - // This is the initial partitioned mesh - Mesh_t naive_mesh; - node_t naive_node; - - // Mesh partitioned by pt-scotch, not including ghost - Mesh_t intermediate_mesh; - node_t intermediate_node; + bool print_info = false; int num_elements_on_rank = 0; int num_nodes_on_rank = 0; @@ -68,10 +57,6 @@ void partition_mesh( // Create a 2D vector to hold the nodal positions on each rank std::vector> node_pos_to_send(world_size); - // create a 2D vector to hold the node positions on each rank - std::vector> node_pos_on_rank(world_size); - - if (rank == 0) { num_nodes_per_elem = initial_mesh.num_nodes_in_elem; @@ -89,9 +74,9 @@ void partition_mesh( MPI_Bcast(&num_nodes_per_elem, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); -// ******************************************************** -// Scatter the number of elements to each rank -// ******************************************************** + // ******************************************************** + // 
Scatter the number of elements to each rank + // ******************************************************** // All ranks participate in the scatter operation // MPI_Scatter signature: // MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, @@ -110,9 +95,9 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); double t_scatter_end = MPI_Wtime(); -// ******************************************************** -// Scatter the actual element global ids to each rank -// ******************************************************** + // ******************************************************** + // Scatter the actual element global ids to each rank + // ******************************************************** double t_scatter_gids_start = MPI_Wtime(); if (rank == 0) { @@ -169,34 +154,10 @@ void partition_mesh( // Wait for all ranks to complete the scatter operation MPI_Barrier(MPI_COMM_WORLD); - // Timer: End measuring time for scattering element global ids - double t_scatter_gids_end = MPI_Wtime(); - if(rank == 0 && print_info) { - std::cout<<" Finished scattering the actual element global ids to each rank"< node_pos_on_rank_flat(num_nodes_on_rank * 3); @@ -366,43 +305,9 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0 && print_info) { - // Print out the node positions on this rank - std::cout << "Rank " << rank << " received node positions: "; - for (int i = 0; i < num_nodes_on_rank; i++) { - std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " - << node_pos_on_rank_flat[i*3+1] << ", " - << node_pos_on_rank_flat[i*3+2] << ") "; - } - std::cout << std::endl; - } - - - MPI_Barrier(MPI_COMM_WORLD); - - if (rank == 1 && print_info) { - // Print out the node positions on this rank - std::cout << "Rank " << rank << " received node positions: "; - for (int i = 0; i < num_nodes_on_rank; i++) { - std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " - << node_pos_on_rank_flat[i*3+1] << ", " - << node_pos_on_rank_flat[i*3+2] << ") "; - } - std::cout 
<< std::endl; - } - - MPI_Barrier(MPI_COMM_WORLD); - - double t_scatter_nodepos_end = MPI_Wtime(); - if(rank == 0) { - std::cout<<" Finished scattering the node positions to each rank"< required_node_state = { node_state::coords }; @@ -417,9 +322,9 @@ void partition_mesh( naive_node.coords.update_device(); -// ****************************************************************************************** -// Send the element-node connectivity data from the initial mesh to each rank -// ****************************************************************************************** + // ****************************************************************************************** + // Send the element-node connectivity data from the initial mesh to each rank + // ****************************************************************************************** // Send the element-node connectivity data from the initial mesh to each rank std::vector nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); @@ -457,38 +362,15 @@ void partition_mesh( 0, MPI_COMM_WORLD); } - MPI_Barrier(MPI_COMM_WORLD); - - double t_scatter_elemnode_end = MPI_Wtime(); - if(rank == 0) { - std::cout << " Finished scattering the element-node connectivity data from the initial mesh to each rank" << std::endl; - std::cout << " Scattering element-node connectivity took " - << (t_scatter_elemnode_end - t_scatter_elemnode_start) << " seconds." 
<< std::endl; - } - - if (rank == 0 && print_info) { - - std::cout << "Rank " << rank << " received element-node connectivity (" - << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; - for (int elem = 0; elem < num_elements_on_rank; elem++) { - std::cout << " Element " << elem << " nodes: "; - for (int node = 0; node < num_nodes_per_elem; node++) { - int idx = elem * num_nodes_per_elem + node; - std::cout << nodes_in_elem_on_rank[idx] << " "; - } - std::cout << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout<<" Finished scattering the element-node connectivity data from the initial mesh to each rank"< elems_in_elem_on_rank(total_elem_elem_entries); + elems_in_elem_on_rank.resize(total_elem_elem_entries); // Now scatter the num_elems_in_elem for each element on each rank - std::vector num_elems_in_elem_per_rank(num_elements_on_rank); + num_elems_in_elem_per_rank.resize(num_elements_on_rank); if (rank == 0) { std::vector all_num_elems_in_elem; @@ -645,9 +508,9 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); -// ****************************************************************************************** -// Initialize the naive_mesh data structures for each rank -// ****************************************************************************************** + // ****************************************************************************************** + // Initialize the naive_mesh data structures for each rank + // ****************************************************************************************** naive_mesh.initialize_nodes(num_nodes_on_rank); naive_mesh.initialize_elems(num_elements_on_rank, 3); @@ -714,12 +577,40 @@ void partition_mesh( naive_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); + + return; +} + +void partition_mesh( + Mesh_t& initial_mesh, + Mesh_t& final_mesh, + node_t& initial_node, + node_t& final_node, + GaussPoint_t& gauss_point, + int world_size, + int 
rank){ + + bool print_info = false; + bool print_vtk = false; + + // Create mesh, gauss points, and node data structures on each rank + // This is the initial partitioned mesh + Mesh_t naive_mesh; + node_t naive_node; + + // Mesh partitioned by pt-scotch, not including ghost + Mesh_t intermediate_mesh; + node_t intermediate_node; + + + // Helper arrays to hold element-element connectivity for naive partitioning that include what would be ghost, without having to build the full mesh + std::vector elems_in_elem_on_rank; + std::vector num_elems_in_elem_per_rank; + + naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); - // if (print_vtk) { - // write_vtk(naive_mesh, naive_node, rank); - // } @@ -784,7 +675,7 @@ void partition_mesh( * neighbors it has. * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. * - **********************************************************************************/ + **********************************************************************************/ // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- SCOTCH_Dgraph dgraph; @@ -814,9 +705,10 @@ void partition_mesh( // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. 
std::map elem_gid_to_offset; size_t current_offset = 0; - for (size_t k = 0; k < num_elements_on_rank; k++) { - elem_gid_to_offset[elements_on_rank[k]] = current_offset; - current_offset += num_elems_in_elem_per_rank[k]; + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + elem_gid_to_offset[elem_gid_on_rank] = current_offset; + current_offset += num_elems_in_elem_per_rank[k]; // WARNING< THIS MUST INCLUDE GHOST< WHICH DONT EXISTS ON THE NAIVE MESH } // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- @@ -836,8 +728,9 @@ void partition_mesh( // For this element, find the count of its neighbors // This requires finding its index in the elements_on_rank array size_t idx = 0; - for (size_t k = 0; k < num_elements_on_rank; k++) { - if (elements_on_rank[k] == elem_gid) { + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + if (elem_gid_on_rank == elem_gid) { idx = k; break; } diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index c49977c0..98f62313 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -116,6 +116,12 @@ class MPICArrayKokkos { // Data member to access host view ViewCArray host; + + // Note, consider this for sending blocks without dealing with stride_ + // MPI_Datatype vector_type; + // MPI_Type_contiguous(stride_, mpi_type_map::value(), &vector_type); + // MPI_Type_commit(&vector_type); + MPICArrayKokkos(); MPICArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); @@ -167,7 +173,7 @@ class MPICArrayKokkos { // Method to set comm plan for halo communication - void initialize_mesh_comm_plan(CommunicationPlan& comm_plan){ + void initialize_comm_plan(CommunicationPlan& comm_plan){ comm_plan_ = &comm_plan; size_t send_size = comm_plan_->total_send_count * stride_; @@ -288,6 +294,8 @@ class 
MPICArrayKokkos { // int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_displs_.host(0) : nullptr; // Method that communicates the data between the ranks + // NOTE: This is a blocking communication operation, + // if you want to use non-blocking communication, you can use the following: MPI_Ineighbor_alltoallv void communicate(){ this_array_.update_host(); @@ -310,37 +318,43 @@ class MPICArrayKokkos { this_array_.update_device(); }; + void set_values(const T& value){ + this_array_.set_values(value); + }; + + void reduce_sum(T& result){}; - // MPI send wrapper - void send(size_t count, int dest, int tag, MPI_Comm comm); - // MPI recieve wrapper - void recv(size_t count, int dest, int tag, MPI_Comm comm); + // // MPI send wrapper + // void send(size_t count, int dest, int tag, MPI_Comm comm); - // MPI broadcast wrapper - void broadcast(size_t count, int root, MPI_Comm comm); + // // MPI recieve wrapper + // void recv(size_t count, int dest, int tag, MPI_Comm comm); - // MPI scatter wrapper - void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + // // MPI broadcast wrapper + // void broadcast(size_t count, int root, MPI_Comm comm); - // MPI gather wrapper - void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + // // MPI scatter wrapper + // void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - // MPI allgather wrapper - void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); + // // MPI gather wrapper + // void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - // MPI send wrapper - void isend(size_t count, int dest, int tag, MPI_Comm comm); + // // MPI allgather wrapper + // void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); - // MPI recieve wrapper - void 
irecv(size_t count, int dest, int tag, MPI_Comm comm); + // // MPI send wrapper + // void isend(size_t count, int dest, int tag, MPI_Comm comm); - // MPI wait wrapper for sender - void wait_send(); + // // MPI recieve wrapper + // void irecv(size_t count, int dest, int tag, MPI_Comm comm); - // MPI wait wrapper for receiver - void wait_recv(); + // // MPI wait wrapper for sender + // void wait_send(); + + // // MPI wait wrapper for receiver + // void wait_recv(); // Deconstructor virtual KOKKOS_INLINE_FUNCTION @@ -350,12 +364,17 @@ class MPICArrayKokkos { // Default constructor template MPICArrayKokkos::MPICArrayKokkos() - : this_array_(), stride_(1) { } + : this_array_(), stride_(1), length_(0), order_(0) { + for (int i = 0; i < 7; i++) { + dims_[i] = 0; + } + } // Overloaded 1D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) - : stride_(1) { + : stride_(1), length_(dim0), order_(1) { + dims_[0] = dim0; this_array_ = DCArrayKokkos(dim0, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0); } @@ -363,7 +382,10 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, c // Overloaded 2D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) - : stride_(dim1) { + : stride_(dim1), length_(dim0 * dim1), order_(2) { + dims_[0] = dim0; + dims_[1] = dim1; + this_array_ = DCArrayKokkos(dim0, dim1, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1); } @@ -371,7 +393,10 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, s // Overloaded 3D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) - : stride_(dim1 * dim2) { + : stride_(dim1 * dim2), length_(dim0 * dim1 * dim2), order_(3) { + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2); } @@ -379,7 
+404,11 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, s // Overloaded 4D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3) { + : stride_(dim1 * dim2 * dim3), length_(dim0 * dim1 * dim2 * dim3), order_(4) { + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); } @@ -387,7 +416,12 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, s // Overloaded 5D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4) { + : stride_(dim1 * dim2 * dim3 * dim4), length_(dim0 * dim1 * dim2 * dim3 * dim4), order_(5) { + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); } @@ -395,7 +429,13 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, s // Overloaded 6D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4 * dim5) { + : stride_(dim1 * dim2 * dim3 * dim4 * dim5), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5), order_(6) { + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); } @@ -403,7 +443,14 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, s // Overloaded 7D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, 
size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6) { + : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6), order_(7) { + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; + dims_[6] = dim6; this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); } diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 385723ed..2ed970d5 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -116,7 +116,7 @@ struct GaussPoint_t case gauss_pt_state::fields_vec: if (fields_vec.size() == 0){ this->fields_vec = MPICArrayKokkos(num_gauss_pnts, num_dims, "gauss_point_fields_vec"); - this->fields_vec.initialize_mesh_comm_plan(comm_plan); + this->fields_vec.initialize_comm_plan(comm_plan); } break; default: From 588fec59005a2ac958314c28850c69b3d9f6697f Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 7 Nov 2025 14:47:48 -0600 Subject: [PATCH 24/52] ENH: Tidying up --- examples/mesh_decomp/communication_plan.h | 26 +-- examples/mesh_decomp/decomp_utils.h | 228 ++-------------------- examples/mesh_decomp/mesh_decomp.cpp | 13 +- 3 files changed, 31 insertions(+), 236 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index 63391262..eabba8da 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -278,31 +278,11 @@ using namespace mtr; } this->recv_displs_.update_device(); - - // Print the send and recv data sequentially per MPI rank for clarity - MPI_Barrier(mpi_comm_world); - int rank, nprocs; - MPI_Comm_rank(mpi_comm_world, &rank); - MPI_Comm_size(mpi_comm_world, &nprocs); - for(int r = 
0; r < nprocs; r++) { - MPI_Barrier(mpi_comm_world); - if(rank == r) { - std::cout << "==============================" << std::endl; - std::cout << "CommunicationPlan info for rank " << rank << std::endl; - for(int i = 0; i < num_send_ranks; i++){ - std::cout << " Send count to rank[" << i << "] (dest rank " << this->send_rank_ids.host(i) << "): " << this->send_counts_.host(i) << std::endl; - std::cout << " Send displs to rank[" << i << "]: " << this->send_displs_.host(i) << std::endl; - } - for(int i = 0; i < num_recv_ranks; i++){ - std::cout << " Recv count from rank[" << i << "] (source rank " << this->recv_rank_ids.host(i) << "): " << this->recv_counts_.host(i) << std::endl; - std::cout << " Recv displs from rank[" << i << "]: " << this->recv_displs_.host(i) << std::endl; - } - std::cout << "==============================" << std::endl << std::flush; - } - } MPI_Barrier(mpi_comm_world); } }; -#endif // COMMUNICATION_PLAN_H \ No newline at end of file +#endif // COMMUNICATION_PLAN_H + + diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 3c2cefc4..bab3f35d 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -33,7 +33,6 @@ void naive_partition_mesh( int rank) { - bool print_info = false; int num_elements_on_rank = 0; @@ -176,8 +175,6 @@ void naive_partition_mesh( } if (print_info) { - - std::cout< ghost_elem_receive_ranks; for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); } - - // Print with ranks this rank will receive element data from sequentially - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] Ranks this rank will receive element data from: "; - for (int rank : ghost_elem_receive_ranks) { - std::cout << rank << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - // 
****************************************************************************************** // Build the final partitioned mesh // ****************************************************************************************** - - final_mesh.initialize_nodes(total_extended_nodes); final_mesh.initialize_elems(total_extended_elems, 3); final_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); @@ -1643,25 +1554,10 @@ void partition_mesh( final_mesh.num_ghost_elems = ghost_elem_gids.size(); final_mesh.num_ghost_nodes = ghost_only_nodes.size(); - // Set owned counts for write_vtk (excludes ghost elements/nodes) + final_mesh.num_owned_elems = intermediate_mesh.num_elems; final_mesh.num_owned_nodes = intermediate_mesh.num_nodes; - - // Print num ghost elements and nodes on each rank sequentially - for (int r = 0; r < world_size; ++r) { - if (rank == r) { - std::cout << "*******[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; - std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_ghost_nodes << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< ranks that ghost it std::unordered_map> gid_to_ghosting_ranks; @@ -1853,38 +1742,8 @@ void partition_mesh( } } - std::cout.flush(); MPI_Barrier(MPI_COMM_WORLD); - - // Optional: print a compact summary of reverse map for verification (limited output) - for(int i = 0; i < world_size; i++) { - if (rank == i && print_info) { - std::cout << std::endl; - for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { - - size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); - if (boundary_elem_targets[elem_lid].empty()) - { - std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost 
elements" << std::endl; - } - else - { - std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: "; - int shown = 0; - for (const auto &pr : boundary_elem_targets[elem_lid]) { - if (shown >= 12) { std::cout << " ..."; break; } - std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; - shown++; - } - std::cout << std::endl; - } - } - } - MPI_Barrier(MPI_COMM_WORLD); - } - - // Add a vector to store boundary element local_ids (those who have ghost destinations across ranks) std::vector boundary_elem_local_ids; std::vector> boundary_to_ghost_ranks; // ragged array dimensions (num_boundary_elems, num_ghost_ranks) @@ -1923,40 +1782,6 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << std::endl; - std::cout << "[rank " << rank << "] elements communicates to ranks: "; - for (int i = 0; i < num_ghost_comm_ranks; ++i) { - std::cout << ghost_comm_ranks_vec[i] << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - print_info = false; - - // Print out the boundary element local ids on each rank sequentially - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r && print_info) { - std::cout << std::endl; - std::cout << "[rank " << rank << "] Boundary element global ids: " <(final_mesh.num_boundary_elems); for (int i = 0; i < final_mesh.num_boundary_elems; i++) { @@ -2084,27 +1909,9 @@ void partition_mesh( } } elems_to_recv_by_rank_rr.update_device(); - - // Debug: Print send vs recv counts per neighbor to diagnose mismatch - if (print_info) { - std::cout << "[rank " << rank << "] Send/Recv count comparison:" << std::endl; - for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { - int dest_rank = element_communication_plan.send_rank_ids.host(i); - int send_count = elems_to_send_by_rank_rr.stride_host(i); - std::cout << " To rank " << 
dest_rank << ": sending " << send_count << " elements" << std::endl; - } - for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - int src_rank = element_communication_plan.recv_rank_ids.host(i); - int recv_count = elems_to_recv_by_rank_rr.stride_host(i); - std::cout << " From rank " << src_rank << ": expecting " << recv_count << " elements" << std::endl; - } - } - element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished building the send and recv counts and displacements for element communication" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); // ****************************************************************************************** // Test element communication using MPI_Neighbor_alltoallv @@ -2119,7 +1926,6 @@ void partition_mesh( // Initialize the gauss point fields on each rank // Set owned elements to rank number, ghost elements to -1 (to verify communication) for (int i = 0; i < final_mesh.num_owned_elems; i++) { - // if(rank == 0) std::cout << " Setting owned element " << i << " to rank " << rank << std::endl; gauss_point.fields.host(i) = static_cast(rank); gauss_point.fields_vec.host(i, 0) = static_cast(rank); gauss_point.fields_vec.host(i, 1) = static_cast(rank); diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index b14ee9cd..1106d99f 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {50, 50, 50}; + int num_elems_dim[3] = {180, 180, 180}; // Initial mesh built on rank zero Mesh_t initial_mesh; @@ -54,15 +54,24 @@ int main(int argc, char** argv) { if (rank == 0) { std::cout<<"World size: "< Date: Fri, 7 Nov 2025 16:47:28 -0600 Subject: [PATCH 25/52] ENH: Attempting to simplify building ghost 
and having a bad time --- examples/mesh_decomp/decomp_utils.h | 248 +++++++++++++-------------- examples/mesh_decomp/mesh_decomp.cpp | 5 +- 2 files changed, 120 insertions(+), 133 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index bab3f35d..3d50e682 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -63,7 +63,7 @@ void naive_partition_mesh( // Compute elements to send to each rank; handle remainders for non-even distribution std::fill(elems_per_rank.begin(), elems_per_rank.end(), initial_mesh.num_elems / world_size); int remainder = initial_mesh.num_elems % world_size; - for (int i = 0; i < remainder; ++i) { + for (int i = 0; i < remainder; i++) { elems_per_rank[i] += 1; } } @@ -529,6 +529,8 @@ void naive_partition_mesh( return; } + + void partition_mesh( Mesh_t& initial_mesh, Mesh_t& final_mesh, @@ -658,7 +660,7 @@ void partition_mesh( // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- SCOTCH_Num offset = 0; // running count of edges encountered - for (size_t lid = 0; lid < naive_mesh.num_elems; ++lid) { + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { // Record current edge offset for vertex lid in vertloctab vertloctab[lid] = offset; @@ -876,7 +878,7 @@ void partition_mesh( print_info = false; for(int rank_id = 0; rank_id < world_size; rank_id++) { if(rank_id == rank && print_info) { - for (size_t lid = 0; lid < naive_mesh.num_elems; ++lid) { + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { size_t gid = naive_mesh.local_to_global_elem_mapping.host(lid); std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid << " -> part=" << partloctab[lid] << "\n"; @@ -887,9 +889,6 @@ void partition_mesh( } print_info = false; - - - // ****************************************************************************************** // Build the final mesh from the repartition // 
****************************************************************************************** @@ -902,7 +901,7 @@ void partition_mesh( // -------------- Phase 1: Determine elements to send to each rank -------------- std::vector> elems_to_send(world_size); - for (int lid = 0; lid < naive_mesh.num_elems; ++lid) { + for (int lid = 0; lid < naive_mesh.num_elems; lid++) { int dest = static_cast(partloctab[lid]); int elem_gid = static_cast(naive_mesh.local_to_global_elem_mapping.host(lid)); elems_to_send[dest].push_back(elem_gid); @@ -929,24 +928,24 @@ void partition_mesh( // Flatten send buffer - std::vector sendbuf; - sendbuf.reserve(send_total); + // send_elems: flattened list of element global IDs (GIDs) that this rank is sending to all other ranks. + // For each rank r, elems_to_send[r] contains the element GIDs that should be owned by rank r after repartitioning. + std::vector send_elems; + send_elems.reserve(send_total); for (int r = 0; r < world_size; ++r) - sendbuf.insert(sendbuf.end(), elems_to_send[r].begin(), elems_to_send[r].end()); + send_elems.insert(send_elems.end(), elems_to_send[r].begin(), elems_to_send[r].end()); - // Receive new local element GIDs - std::vector recvbuf(recv_total); - MPI_Alltoallv(sendbuf.data(), sendcounts.data(), sdispls.data(), MPI_INT, - recvbuf.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); + // new_elem_gids: receives the list of new element global IDs this rank will own after the exchange. + // It is filled after MPI_Alltoallv completes, and contains the GIDs for the elements new to (or remained on) this rank. 
+ std::vector new_elem_gids(recv_total); + MPI_Alltoallv(send_elems.data(), sendcounts.data(), sdispls.data(), MPI_INT, + new_elem_gids.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element GIDs"< new_elem_gids = recvbuf; int num_new_elems = static_cast(new_elem_gids.size()); - if (print_info) { std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; } @@ -956,12 +955,12 @@ void partition_mesh( // Flatten element-node connectivity by global node IDs std::vector conn_sendbuf; - for (int r = 0; r < world_size; ++r) { - for (int gid : elems_to_send[r]) { - // find local element lid from gid + for (int r = 0; r < world_size; r++) { + for (int elem_gid : elems_to_send[r]) { + // find local element lid from elem_gid int lid = -1; - for (int i = 0; i < naive_mesh.num_elems; ++i) - if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + for (int i = 0; i < naive_mesh.num_elems; i++) + if (naive_mesh.local_to_global_elem_mapping.host(i) == elem_gid) { lid = i; break; } for (int j = 0; j < nodes_per_elem; j++) { int node_lid = naive_mesh.nodes_in_elem.host(lid, j); @@ -973,7 +972,7 @@ void partition_mesh( // element-node connectivity counts (ints per dest rank) std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); - for (int r = 0; r < world_size; ++r) + for (int r = 0; r < world_size; r++) conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); @@ -1005,7 +1004,7 @@ void partition_mesh( // Build map gid→lid std::unordered_map node_gid_to_lid; - for (int i = 0; i < num_new_nodes; ++i) + for (int i = 0; i < num_new_nodes; i++) node_gid_to_lid[new_node_gids[i]] = i; if (print_info) @@ -1017,7 +1016,7 @@ void partition_mesh( for (int r = 0; r < world_size; ++r) { for (int gid : elems_to_send[r]) { int lid = -1; - for 
(int i = 0; i < naive_mesh.num_elems; ++i) + for (int i = 0; i < naive_mesh.num_elems; i++) if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } for (int j = 0; j < nodes_per_elem; j++) { @@ -1063,9 +1062,9 @@ void partition_mesh( intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); // Fill global mappings - for (int i = 0; i < num_new_nodes; ++i) + for (int i = 0; i < num_new_nodes; i++) intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; - for (int i = 0; i < num_new_elems; ++i) + for (int i = 0; i < num_new_elems; i++) intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; intermediate_mesh.local_to_global_node_mapping.update_device(); @@ -1075,10 +1074,10 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"<> node_gid_to_coords; int coord_idx = 0; - for (int e = 0; e < num_new_elems; ++e) { - for (int j = 0; j < nodes_per_elem; j++) { - int node_gid = conn_recvbuf[e * nodes_per_elem + j]; + for (int e = 0; e < intermediate_mesh.num_elems; ++e) { + for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { + int node_gid = conn_recvbuf[e * intermediate_mesh.num_nodes_in_elem + j]; if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { node_gid_to_coords[node_gid] = { coord_recvbuf[coord_idx*3 + 0], @@ -1127,7 +1125,7 @@ void partition_mesh( // Now fill coordinates in node order intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); - for (int i = 0; i < num_new_nodes; ++i) { + for (int i = 0; i < num_new_nodes; i++) { int node_gid = new_node_gids[i]; auto it = node_gid_to_coords.find(node_gid); if (it != node_gid_to_coords.end()) { @@ -1152,18 +1150,9 @@ void partition_mesh( // First, gather the number of elements each rank owns std::vector elem_counts(world_size); - - // int MPI_Allgather( - // const 
void* sendbuf, // Data to send from this process - // int sendcount, // Number of elements to send - // MPI_Datatype sendtype, // Type of send data - // void* recvbuf, // Buffer to receive all data - // int recvcount, // Number of elements to receive from each process - // MPI_Datatype recvtype, // Type of receive data - // MPI_Comm comm // Communicator - // ); MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + // Compute displacements std::vector elem_displs(world_size); int total_elems = 0; @@ -1174,17 +1163,6 @@ void partition_mesh( // Gather all element GIDs from all ranks std::vector all_elem_gids(total_elems); - - // int MPI_Allgatherv( - // const void* sendbuf, // Data to send from this process - // int sendcount, // Number of elements THIS process sends - // MPI_Datatype sendtype, // Type of send data - // void* recvbuf, // Buffer to receive all data - // const int* recvcounts, // Array: number of elements from each process - // const int* displs, // Array: displacement for each process's data - // MPI_Datatype recvtype, // Type of receive data - // MPI_Comm comm // Communicator - // ); MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, all_elem_gids.data(), elem_counts.data(), elem_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); @@ -1192,36 +1170,32 @@ void partition_mesh( // Build a map: element GID -> owning rank std::map elem_gid_to_rank; for (int r = 0; r < world_size; ++r) { - for (int i = 0; i < elem_counts[r]; ++i) { + for (int i = 0; i < elem_counts[r]; i++) { size_t gid = all_elem_gids[elem_displs[r] + i]; elem_gid_to_rank[gid] = r; } } - // Strategy: Find ghost elements by checking neighbors of our boundary elements. - // A boundary element is one that has a neighbor owned by another rank. 
- // However, since build_connectivity() only includes locally-owned elements, - // we need to use a different approach: find elements on other ranks that share + // Strategy: Find elements on other ranks that share // nodes with our locally-owned elements. // First, collect all nodes that belong to our locally-owned elements std::set local_elem_nodes; - for (int lid = 0; lid < num_new_elems; ++lid) { - for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - local_elem_nodes.insert(node_gid); - } + + for(int node_rid = 0; node_rid < intermediate_mesh.num_nodes; node_rid++) { + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_rid); + local_elem_nodes.insert(node_gid); } + // Now collect element-to-node connectivity to send to all ranks // Format: for each element, list its node GIDs (each entry is a pair: elem_gid, node_gid) std::vector elem_node_conn; int local_conn_size = 0; - for (int lid = 0; lid < num_new_elems; ++lid) { + for (int lid = 0; lid < intermediate_mesh.num_elems; lid++) { size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); - for (int j = 0; j < nodes_per_elem; j++) { + for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); elem_node_conn.push_back(elem_gid); @@ -1229,12 +1203,15 @@ void partition_mesh( } local_conn_size += nodes_per_elem * 2; // Each pair is 2 size_ts } + + // Exchange element-node connectivity with all ranks using Allgather // First, gather the sizes from each rank std::vector conn_sizes(world_size); MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + // Compute displacements std::vector conn_displs(world_size); int total_conn 
= 0; @@ -1249,9 +1226,40 @@ void partition_mesh( all_conn.data(), conn_sizes.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + + + DCArrayKokkos local_nodes_in_elem(intermediate_mesh.num_elems, intermediate_mesh.num_nodes_in_elem); + DCArrayKokkos all_nodes_in_elem(total_elems, intermediate_mesh.num_nodes_in_elem); + + std::vector mtr_conn_sizes(world_size); + + + local_nodes_in_elem = intermediate_mesh.nodes_in_elem; + int mtr_size = intermediate_mesh.num_elems * intermediate_mesh.num_nodes_in_elem; + + MPI_Allgather(&mtr_size, 1, MPI_INT, mtr_conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements + std::vector mtr_conn_displs(world_size); + int total_mtr_conn = 0; + for (int r = 0; r < world_size; ++r) { + mtr_conn_displs[r] = total_mtr_conn; + total_mtr_conn += mtr_conn_sizes[r]; + } + + + MPI_Allgatherv(local_nodes_in_elem.host_pointer(), mtr_size, MPI_UNSIGNED_LONG_LONG, + all_nodes_in_elem.host_pointer(), mtr_conn_sizes.data(), mtr_conn_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + + + // create a set for local_elem_gids std::set local_elem_gids; - for (int i = 0; i < num_new_elems; ++i) { + for (int i = 0; i < intermediate_mesh.num_elems; i++) { local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); } @@ -1261,7 +1269,7 @@ void partition_mesh( if (r == rank) continue; // Skip our own data // Process pairs from rank r: conn_sizes[r] is in units of size_ts, so num_pairs = conn_sizes[r] / 2 int num_pairs = conn_sizes[r] / 2; - for (int i = 0; i < num_pairs; ++i) { + for (int i = 0; i < num_pairs; i++) { // Each pair is 2 size_ts, starting at conn_displs[r] int offset = conn_displs[r] + i * 2; size_t elem_gid = all_conn[offset]; @@ -1288,24 +1296,25 @@ void partition_mesh( // Additional check: elements that are neighbors of our locally-owned elements // but are owned by other ranks (these 
might already be in ghost_elem_gids, but check connectivity) - for (int lid = 0; lid < num_new_elems; ++lid) { - size_t num_neighbors = intermediate_mesh.num_elems_in_elem(lid); + // for (int lid = 0; lid < num_new_elems; lid++) { + // size_t num_neighbors = intermediate_mesh.num_elems_in_elem(lid); - for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { - size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); + // for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { + // size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); - if (neighbor_lid < static_cast(num_new_elems)) { - size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); + // if (neighbor_lid < static_cast(num_new_elems)) { + // size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); - // Check if neighbor is owned by this rank - auto it = elem_gid_to_rank.find(neighbor_gid); - if (it != elem_gid_to_rank.end() && it->second != rank) { - // Neighbor is owned by another rank - it's a ghost for us - ghost_elem_gids.insert(neighbor_gid); - } - } - } - } + // // Check if neighbor is owned by this rank + // auto it = elem_gid_to_rank.find(neighbor_gid); + // if (it != elem_gid_to_rank.end() && it->second != rank) { + // // Neighbor is owned by another rank - it's a ghost for us + // std::cout << "[rank " << rank << "] found ghost element " << neighbor_gid << std::endl; + // ghost_elem_gids.insert(neighbor_gid); + // } + // } + // } + // } // Count unique ghost elements intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); @@ -1317,32 +1326,6 @@ void partition_mesh( std::cout << " Finished calculating ghost elements" << std::endl; std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." 
<< std::endl; } - - // Print ghost element info if requested - print_info = false; - for(int i = 0; i < world_size; i++) { - MPI_Barrier(MPI_COMM_WORLD); - if(rank == i && print_info) { - std::cout << "[rank " << rank << "] owns " << num_new_elems - << " elements and has " << intermediate_mesh.num_ghost_elems << " ghost elements" << std::endl; - std::cout << "[rank " << rank << "] owned element global IDs: "; - for (int j = 0; j < intermediate_mesh.num_elems; j++) { - std::cout << intermediate_mesh.local_to_global_elem_mapping(j) << " "; - } - - // Print global IDs of ghost elements - std::cout << std::endl << "[rank " << rank << "] ghost element global IDs: "; - for (const auto& gid : ghost_elem_gids) { - std::cout << gid << " "; - } - std::cout << std::endl; - } - - MPI_Barrier(MPI_COMM_WORLD); - } - - - // Build the connectivity that includes ghost elements // Create an extended mesh with owned elements first, then ghost elements appended @@ -1353,17 +1336,18 @@ void partition_mesh( // Build a map: ghost_elem_gid -> vector of node_gids (ordered as in all_conn) std::map> ghost_elem_to_nodes; for (const size_t& ghost_gid : ghost_elem_gids) { - ghost_elem_to_nodes[ghost_gid].reserve(nodes_per_elem); + ghost_elem_to_nodes[ghost_gid].reserve(intermediate_mesh.num_nodes_in_elem); } // Extract nodes for each ghost element from all_conn // The all_conn array has pairs (elem_gid, node_gid) for each rank's elements for (int r = 0; r < world_size; ++r) { if (r == rank) continue; // Skip our own data (we already have owned element connectivity) + int num_pairs = conn_sizes[r] / 2; // Process pairs in order - each element's nodes are contiguous - for (int i = 0; i < num_pairs; ++i) { + for (int i = 0; i < num_pairs; i++) { int offset = conn_displs[r] + i * 2; size_t elem_gid = all_conn[offset]; size_t node_gid = all_conn[offset + 1]; @@ -1378,9 +1362,9 @@ void partition_mesh( // Verify each ghost element has the correct number of nodes for (auto& pair : ghost_elem_to_nodes) 
{ - if (pair.second.size() != static_cast(nodes_per_elem)) { + if (pair.second.size() != static_cast(intermediate_mesh.num_nodes_in_elem)) { std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first - << " has " << pair.second.size() << " nodes, expected " << nodes_per_elem << std::endl; + << " has " << pair.second.size() << " nodes, expected " << intermediate_mesh.num_nodes_in_elem << std::endl; } } @@ -1390,7 +1374,7 @@ void partition_mesh( int extended_node_lid = 0; // Add all owned nodes - for (int i = 0; i < intermediate_mesh.num_nodes; ++i) { + for (int i = 0; i < intermediate_mesh.num_nodes; i++) { size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(i); node_gid_to_extended_lid[node_gid] = extended_node_lid++; } @@ -1448,7 +1432,7 @@ void partition_mesh( std::vector> extended_nodes_in_elem(total_extended_elems); // Copy owned element connectivity (convert to extended node LIDs) - for (int lid = 0; lid < intermediate_mesh.num_elems; ++lid) { + for (int lid = 0; lid < intermediate_mesh.num_elems; lid++) { extended_nodes_in_elem[lid].reserve(nodes_per_elem); for (int j = 0; j < nodes_per_elem; j++) { size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); @@ -1505,17 +1489,17 @@ void partition_mesh( // Build extended element GID list: owned first, then ghost std::vector extended_lid_to_elem_gid(total_extended_elems); // Owned elements - for (int i = 0; i < intermediate_mesh.num_elems; ++i) { + for (int i = 0; i < intermediate_mesh.num_elems; i++) { extended_lid_to_elem_gid[i] = intermediate_mesh.local_to_global_elem_mapping.host(i); } // Ghost elements (in sorted order) - for (size_t idx = 0; idx < ghost_elem_gids_ordered.size(); ++idx) { - extended_lid_to_elem_gid[intermediate_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + extended_lid_to_elem_gid[intermediate_mesh.num_elems + i] = ghost_elem_gids_ordered[i]; } // Build array: for each ghost 
element, store which rank owns it (where to receive data from) std::vector ghost_elem_owner_ranks(ghost_elem_gids_ordered.size()); - for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { size_t ghost_gid = ghost_elem_gids_ordered[i]; auto it = elem_gid_to_rank.find(ghost_gid); if (it != elem_gid_to_rank.end()) { @@ -1529,7 +1513,7 @@ void partition_mesh( // Create a std::set of all the ranks this rank will receive data from std::set ghost_elem_receive_ranks; - for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); } @@ -1595,13 +1579,13 @@ void partition_mesh( // 1. Build list of all global node IDs needed on this rank (owned + ghosts) std::vector all_needed_node_gids(total_extended_nodes); - for (int i = 0; i < total_extended_nodes; ++i) { + for (int i = 0; i < total_extended_nodes; i++) { all_needed_node_gids[i] = final_mesh.local_to_global_node_mapping.host(i); } // 2. Build owned node GIDs and their coordinates std::vector owned_gids(final_mesh.num_owned_nodes); - for (int i = 0; i < final_mesh.num_owned_nodes; ++i) + for (int i = 0; i < final_mesh.num_owned_nodes; i++) owned_gids[i] = final_mesh.local_to_global_node_mapping.host(i); // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) @@ -1633,7 +1617,7 @@ void partition_mesh( // d) Global coords (size: total_owned x 3) std::vector owned_coords_send(3*local_owned_count, 0.0); - for (int i=0; i coord[3] std::unordered_map> gid_to_coord; - for (int i=0; i xyz = { all_owned_coords[3*i+0], all_owned_coords[3*i+1], @@ -1664,7 +1648,7 @@ void partition_mesh( } // 4. Finally, fill final_node.coords with correct coordinates. 
- for (int i = 0; i < total_extended_nodes; ++i) { + for (int i = 0; i < total_extended_nodes; i++) { size_t gid = final_mesh.local_to_global_node_mapping.host(i); auto it = gid_to_coord.find(gid); if (it != gid_to_coord.end()) { @@ -1694,7 +1678,7 @@ void partition_mesh( // Prepare local ghost list as vector std::vector ghost_gids_vec; ghost_gids_vec.reserve(final_mesh.num_ghost_elems); - for (int i = 0; i < final_mesh.num_ghost_elems; ++i) { + for (int i = 0; i < final_mesh.num_ghost_elems; i++) { ghost_gids_vec.push_back(final_mesh.local_to_global_elem_mapping.host(final_mesh.num_owned_elems + i)); // Ghost elements are after the owned elements in the global element mapping } @@ -1724,7 +1708,7 @@ void partition_mesh( for (int r = 0; r < world_size; ++r) { int cnt = ghost_counts[r]; int off = ghost_displs[r]; - for (int i = 0; i < cnt; ++i) { + for (int i = 0; i < cnt; i++) { size_t g = all_ghost_gids[off + i]; gid_to_ghosting_ranks[g].push_back(r); } @@ -2014,7 +1998,7 @@ void partition_mesh( // for (int r = 0; r < world_size; ++r) { // int cnt = ghost_node_counts[r]; // int off = ghost_node_displs[r]; - // for (int i = 0; i < cnt; ++i) { + // for (int i = 0; i < cnt; i++) { // size_t g = all_ghost_node_gids[off + i]; // node_gid_to_ghosting_ranks[g].push_back(r); // } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 1106d99f..7de5d847 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {180, 180, 180}; + int num_elems_dim[3] = {100, 100, 100}; // Initial mesh built on rank zero Mesh_t initial_mesh; @@ -69,6 +69,9 @@ int main(int argc, char** argv) { double t_partition_start = MPI_Wtime(); partition_mesh(initial_mesh, final_mesh, initial_node, final_node, gauss_point, world_size, rank); double t_partition_end = 
MPI_Wtime(); + + + if(rank == 0) { printf("Mesh partitioning time: %.2f seconds\n", t_partition_end - t_partition_start); } From 8a7de21ee6ddf4f6e822cc5c34d35266d8e714d3 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 7 Nov 2025 16:59:29 -0600 Subject: [PATCH 26/52] DOC: Improving documentation of ghost --- examples/mesh_decomp/decomp_utils.h | 221 +++++++++++++++++++--------- 1 file changed, 152 insertions(+), 69 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 3d50e682..ff4f87cd 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -849,7 +849,7 @@ void partition_mesh( // Use SCOTCH_STRATQUALITY for best cut quality. // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) - SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.01); + SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.001); // partloctab: output array mapping each local element (vertex) to a *target partition number* // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. @@ -1144,16 +1144,71 @@ void partition_mesh( // ****************************************************************************************** // Build the ghost elements and nodes -// ****************************************************************************************** +// ================================================================================================** +// +// OVERVIEW OF GHOST ELEMENT IDENTIFICATION: +// ========================================== +// In distributed memory parallel computing with MPI, each processor (rank) owns a subset of mesh +// elements. 
However, to perform computations that depend on element neighbors or to maintain +// consistency at domain boundaries, we need ghost elements: copies of elements from neighboring +// ranks that share nodes with our locally-owned elements. +// +// This algorithm identifies and extracts ghost element data in 5 steps: +// 1. Gather ownership information: Which rank owns which elements (via MPI_Allgatherv) +// 2. Collect local element-node connectivity for distribution +// 3. Broadcast connectivity to all ranks (via MPI_Allgatherv) +// 4. Identify which remote elements touch our local elements +// 5. Extract the full connectivity data for identified ghost elements +// +// KEY DATA STRUCTURES: +// - elem_gid_to_rank: Map from element global ID to owning rank +// - all_elem_gids: Every element GID from every rank (on every rank) +// - all_conn: Flattened (elem_gid, node_gid) pairs from every rank (on every rank) +// - ghost_elem_gids: Set of remote element GIDs that are ghosts for this rank +// - ghost_elem_to_nodes: Map from ghost element GID to its node GIDs +// +// WHY THIS APPROACH? +// - MPI_Allgatherv is efficient for gathering all data to all ranks +// - Connectivity pairs allow flexible reconstruction of element-node relationships +// - Using sets and maps for efficient lookups (O(log n) instead of O(n)) +// - Distributed computation avoids a single bottleneck rank +// double t_ghost_start = MPI_Wtime(); - // First, gather the number of elements each rank owns + // ======================================================================== + // STEP 1: Gather element ownership information from all ranks + // ======================================================================== + // In a distributed mesh, each rank owns a subset of elements. To identify + // ghost elements (elements from other ranks needed by this rank), we need + // to know which rank owns each element. This section uses MPI collective + // operations to gather element GID ownership information. 
+ // + // MPI COLLECTIVE OPERATIONS EXPLAINED: + // ==================================== + // - MPI_Barrier: Synchronizes all ranks; waits until all ranks reach this point + // - MPI_Allgather: Each rank sends one item of data; each rank receives one item from each rank + // Input: Each rank provides local data + // Output: Every rank has data from every rank in order (rank 0's data, rank 1's data, ...) + // - MPI_Allgatherv: Like MPI_Allgather but for variable-sized data + // Input: Each rank provides data of potentially different sizes + // Output: Every rank has all data from all ranks, with displacement arrays specifying where each rank's data goes + // + // COMMUNICATION PATTERN VISUALIZATION: + // Rank 0: elem_count[0] ----> All ranks receive: [elem_count[0], elem_count[1], elem_count[2], ...] + // Rank 1: elem_count[1] / + // Rank 2: elem_count[2] / + + // MPI_Allgather: Each rank sends its element count, every rank receives + // the count from every other rank. Result: elem_counts[r] = number of + // elements owned by rank r. std::vector elem_counts(world_size); MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); // Synchronize all ranks before proceeding - // Compute displacements + // Compute displacements: offset into the global array for each rank's data + // Example: if elem_counts = [100, 150, 120], then + // elem_displs = [0, 100, 250] (where each rank's data starts in all_elem_gids) std::vector elem_displs(world_size); int total_elems = 0; for (int r = 0; r < world_size; ++r) { @@ -1161,13 +1216,18 @@ void partition_mesh( total_elems += elem_counts[r]; } - // Gather all element GIDs from all ranks + // MPI_Allgatherv: Gather variable-sized data from all ranks into one array + // Each rank contributes its local_to_global_elem_mapping, which maps + // local element indices to global element GIDs. 
After this call, + // all_elem_gids contains ALL element GIDs from all ranks, organized by rank. std::vector all_elem_gids(total_elems); MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, all_elem_gids.data(), elem_counts.data(), elem_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - // Build a map: element GID -> owning rank + + // Build a lookup map: element GID -> owning rank + // This allows O(log n) lookups to determine which rank owns any given element. std::map elem_gid_to_rank; for (int r = 0; r < world_size; ++r) { for (int i = 0; i < elem_counts[r]; i++) { @@ -1176,43 +1236,66 @@ void partition_mesh( } } - // Strategy: Find elements on other ranks that share - // nodes with our locally-owned elements. + // ======================================================================== + // STEP 2: Build element-to-node connectivity for local elements + // ======================================================================== + // Ghost elements are elements from other ranks that share nodes with our + // locally-owned elements. To identify them, we need to exchange element-node + // connectivity information with all other ranks. 
- // First, collect all nodes that belong to our locally-owned elements + // Collect all nodes that belong to our locally-owned elements + // This set will be used later to check if a remote element is relevant std::set local_elem_nodes; - for(int node_rid = 0; node_rid < intermediate_mesh.num_nodes; node_rid++) { size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_rid); local_elem_nodes.insert(node_gid); } - - // Now collect element-to-node connectivity to send to all ranks - // Format: for each element, list its node GIDs (each entry is a pair: elem_gid, node_gid) + // ======================================================================== + // STEP 3: Exchange element-to-node connectivity via MPI_Allgatherv + // ======================================================================== + // Build a flattened connectivity array: pairs of (elem_gid, node_gid) + // Example for 2 elements with 8 nodes each: + // elem_node_conn = [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] + // + // This format is chosen because it's easy to serialize and deserialize over MPI, + // and allows us to reconstruct the full element-node relationships. 
std::vector elem_node_conn; int local_conn_size = 0; + // For each locally-owned element, record its GID and all its node GIDs for (int lid = 0; lid < intermediate_mesh.num_elems; lid++) { size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); + + // Access nodes_in_elem[lid][*] to get all nodes in this element for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); // Local index + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); // Global index + elem_node_conn.push_back(elem_gid); elem_node_conn.push_back(node_gid); } - local_conn_size += nodes_per_elem * 2; // Each pair is 2 size_ts + local_conn_size += nodes_per_elem * 2; // Each element contributes (num_nodes_in_elem * 2) size_ts } - // Exchange element-node connectivity with all ranks using Allgather - // First, gather the sizes from each rank + // ======================================================================== + // Perform MPI communication to gather connectivity from all ranks + // ======================================================================== + // Similar to Step 1, we use MPI_Allgatherv to collect all element-node + // connectivity pairs. 
This is a two-stage process: + // 1) Gather the size of each rank's connectivity data + // 2) Gather the actual connectivity data with proper offsets + + // Stage 1: Gather connectivity sizes from each rank + // conn_sizes[r] = number of size_t values that rank r will send std::vector conn_sizes(world_size); MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - // Compute displacements + // Compute displacements for the second MPI_Allgatherv call + // Displcements tell each rank where its data should be placed in the global array std::vector conn_displs(world_size); int total_conn = 0; for (int r = 0; r < world_size; ++r) { @@ -1220,72 +1303,59 @@ void partition_mesh( total_conn += conn_sizes[r]; } - // Gather all element-node pairs from all ranks + // Stage 2: Gather all element-node connectivity data + // After this call, all_conn contains the flattened connectivity from every rank, + // organized by rank. Access data from rank r using indices [conn_displs[r], conn_displs[r] + conn_sizes[r]) std::vector all_conn(total_conn); MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, all_conn.data(), conn_sizes.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - - - DCArrayKokkos local_nodes_in_elem(intermediate_mesh.num_elems, intermediate_mesh.num_nodes_in_elem); - DCArrayKokkos all_nodes_in_elem(total_elems, intermediate_mesh.num_nodes_in_elem); - - std::vector mtr_conn_sizes(world_size); - - local_nodes_in_elem = intermediate_mesh.nodes_in_elem; - int mtr_size = intermediate_mesh.num_elems * intermediate_mesh.num_nodes_in_elem; - - MPI_Allgather(&mtr_size, 1, MPI_INT, mtr_conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - - // Compute displacements - std::vector mtr_conn_displs(world_size); - int total_mtr_conn = 0; - for (int r = 0; r < world_size; ++r) { - mtr_conn_displs[r] = total_mtr_conn; - 
total_mtr_conn += mtr_conn_sizes[r]; - } - - - MPI_Allgatherv(local_nodes_in_elem.host_pointer(), mtr_size, MPI_UNSIGNED_LONG_LONG, - all_nodes_in_elem.host_pointer(), mtr_conn_sizes.data(), mtr_conn_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - - - + // ======================================================================== + // STEP 4: Identify ghost elements + // ======================================================================== + // A ghost element is an element owned by another rank that shares at least + // one node with our locally-owned elements. This step identifies all such elements. - // create a set for local_elem_gids + // Build a set of locally-owned element GIDs for quick lookup std::set local_elem_gids; for (int i = 0; i < intermediate_mesh.num_elems; i++) { local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); } - // Build a map: node GID -> set of element GIDs that contain it (from other ranks) + // Build a temporary map: node GID -> set of element GIDs (from other ranks) that contain it + // This helps us identify which remote elements are adjacent to our local elements std::map> node_to_ext_elem; + + // Iterate through connectivity data from each rank (except ourselves) for (int r = 0; r < world_size; ++r) { - if (r == rank) continue; // Skip our own data - // Process pairs from rank r: conn_sizes[r] is in units of size_ts, so num_pairs = conn_sizes[r] / 2 + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 int num_pairs = conn_sizes[r] / 2; + for (int i = 0; i < num_pairs; i++) { - // Each pair is 2 size_ts, starting at conn_displs[r] + // Offset into all_conn for this pair (elem_gid, node_gid) int offset = conn_displs[r] + i * 2; size_t elem_gid = all_conn[offset]; size_t node_gid = all_conn[offset + 1]; - // If this node is in one of our elements, then the element is a potential ghost + // Check if this node belongs to one of our locally-owned elements if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { - // Check if this element is not owned by us + // Check if this element is NOT owned by us (i.e., it's from another rank) if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { + // This is a ghost element for us node_to_ext_elem[node_gid].insert(elem_gid); } } } } - // Collect all unique ghost element GIDs + // Extract all unique ghost element GIDs + // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) std::set ghost_elem_gids; for (const auto& pair : node_to_ext_elem) { for (size_t elem_gid : pair.second) { @@ -1316,7 +1386,7 @@ void partition_mesh( // } // } - // Count unique ghost elements + // Store the count of ghost elements for later use intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); MPI_Barrier(MPI_COMM_WORLD); @@ -1326,33 +1396,43 @@ void partition_mesh( std::cout << " Finished calculating ghost elements" << std::endl; std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." 
<< std::endl; } - // Build the connectivity that includes ghost elements - // Create an extended mesh with owned elements first, then ghost elements appended + + // ======================================================================== + // STEP 5: Extract ghost element connectivity + // ======================================================================== + // Now that we know which elements are ghosts, we need to extract their + // full node connectivity from all_conn. This allows us to properly construct + // the extended mesh with ghost elements included. MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; - // Step 1: Extract ghost element-node connectivity from all_conn - // Build a map: ghost_elem_gid -> vector of node_gids (ordered as in all_conn) + // Build a map: ghost_elem_gid -> vector of node_gids + // We pre-allocate the vector size to avoid repeated reallocations std::map> ghost_elem_to_nodes; for (const size_t& ghost_gid : ghost_elem_gids) { ghost_elem_to_nodes[ghost_gid].reserve(intermediate_mesh.num_nodes_in_elem); } - // Extract nodes for each ghost element from all_conn - // The all_conn array has pairs (elem_gid, node_gid) for each rank's elements + // ======================================================================== + // Extract nodes for each ghost element from the globally-collected all_conn + // ======================================================================== + // The all_conn array was populated by MPI_Allgatherv and contains connectivity + // pairs (elem_gid, node_gid) for all elements from all ranks. We now parse + // this data to extract the nodes for each ghost element. 
for (int r = 0; r < world_size; ++r) { - if (r == rank) continue; // Skip our own data (we already have owned element connectivity) + if (r == rank) continue; // Skip our own data - we already have owned element connectivity + // Parse connectivity data for rank r int num_pairs = conn_sizes[r] / 2; - // Process pairs in order - each element's nodes are contiguous for (int i = 0; i < num_pairs; i++) { + // Calculate offset for this pair: displacement + (pair_index * 2) int offset = conn_displs[r] + i * 2; size_t elem_gid = all_conn[offset]; size_t node_gid = all_conn[offset + 1]; - // If this is one of our ghost elements, record its node (in order) + // If this element is one of our identified ghost elements, record its node auto it = ghost_elem_to_nodes.find(elem_gid); if (it != ghost_elem_to_nodes.end()) { it->second.push_back(node_gid); @@ -1360,7 +1440,10 @@ void partition_mesh( } } - // Verify each ghost element has the correct number of nodes + // ======================================================================== + // Validation: Verify each ghost element has the correct number of nodes + // ======================================================================== + // This catch detects issues in the MPI communication or parsing logic for (auto& pair : ghost_elem_to_nodes) { if (pair.second.size() != static_cast(intermediate_mesh.num_nodes_in_elem)) { std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first From accd023660cf07719e661c7ce676ea0c0bd82999 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 10 Nov 2025 11:01:12 -0600 Subject: [PATCH 27/52] STYLE: Tidying up, and testing with vtk read mesh --- examples/mesh_decomp/decomp_utils.h | 59 +++---- examples/mesh_decomp/mesh_decomp.cpp | 4 + examples/mesh_decomp/mesh_io.h | 254 +++++++++++++++++++++++++++ 3 files changed, 287 insertions(+), 30 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index ff4f87cd..dada0d99 100644 --- 
a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -909,7 +909,7 @@ void partition_mesh( // -------------- Phase 2: Exchange element GIDs -------------- std::vector sendcounts(world_size), recvcounts(world_size); - for (int r = 0; r < world_size; ++r) + for (int r = 0; r < world_size; r++) sendcounts[r] = static_cast(elems_to_send[r].size()); MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); @@ -919,7 +919,7 @@ void partition_mesh( // Compute displacements std::vector sdispls(world_size), rdispls(world_size); int send_total = 0, recv_total = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { sdispls[r] = send_total; rdispls[r] = recv_total; send_total += sendcounts[r]; @@ -932,7 +932,7 @@ void partition_mesh( // For each rank r, elems_to_send[r] contains the element GIDs that should be owned by rank r after repartitioning. std::vector send_elems; send_elems.reserve(send_total); - for (int r = 0; r < world_size; ++r) + for (int r = 0; r < world_size; r++) send_elems.insert(send_elems.end(), elems_to_send[r].begin(), elems_to_send[r].end()); // new_elem_gids: receives the list of new element global IDs this rank will own after the exchange. 
@@ -983,7 +983,7 @@ void partition_mesh( std::vector conn_sdispls(world_size), conn_rdispls(world_size); int conn_send_total = 0, conn_recv_total = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { conn_sdispls[r] = conn_send_total; conn_rdispls[r] = conn_recv_total; conn_send_total += conn_sendcounts[r]; @@ -1013,7 +1013,7 @@ void partition_mesh( // -------------- Phase 5: Request node coordinates -------------- std::vector node_coords_sendbuf; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { for (int gid : elems_to_send[r]) { int lid = -1; for (int i = 0; i < naive_mesh.num_elems; i++) @@ -1032,7 +1032,7 @@ void partition_mesh( // Each node is 3 doubles; same sendcounts scaling applies std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); - for (int r = 0; r < world_size; ++r) + for (int r = 0; r < world_size; r++) coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); @@ -1041,7 +1041,7 @@ void partition_mesh( std::vector coord_sdispls(world_size), coord_rdispls(world_size); int coord_send_total = 0, coord_recv_total = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { coord_sdispls[r] = coord_send_total; coord_rdispls[r] = coord_recv_total; coord_send_total += coord_sendcounts[r]; @@ -1211,7 +1211,7 @@ void partition_mesh( // elem_displs = [0, 100, 250] (where each rank's data starts in all_elem_gids) std::vector elem_displs(world_size); int total_elems = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { elem_displs[r] = total_elems; total_elems += elem_counts[r]; } @@ -1229,7 +1229,7 @@ void partition_mesh( // Build a lookup map: element GID -> owning rank // This allows O(log n) lookups to determine which rank owns any given element. 
std::map elem_gid_to_rank; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { for (int i = 0; i < elem_counts[r]; i++) { size_t gid = all_elem_gids[elem_displs[r] + i]; elem_gid_to_rank[gid] = r; @@ -1298,7 +1298,7 @@ void partition_mesh( // Displcements tell each rank where its data should be placed in the global array std::vector conn_displs(world_size); int total_conn = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { conn_displs[r] = total_conn; total_conn += conn_sizes[r]; } @@ -1329,7 +1329,7 @@ void partition_mesh( std::map> node_to_ext_elem; // Iterate through connectivity data from each rank (except ourselves) - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { if (r == rank) continue; // Skip our own data - we already know our elements // Parse the connectivity data for rank r @@ -1390,12 +1390,7 @@ void partition_mesh( intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); MPI_Barrier(MPI_COMM_WORLD); - double t_ghost_end = MPI_Wtime(); - if (rank == 0) { - std::cout << " Finished calculating ghost elements" << std::endl; - std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." << std::endl; - } // ======================================================================== // STEP 5: Extract ghost element connectivity @@ -1420,7 +1415,7 @@ void partition_mesh( // The all_conn array was populated by MPI_Allgatherv and contains connectivity // pairs (elem_gid, node_gid) for all elements from all ranks. We now parse // this data to extract the nodes for each ghost element. 
- for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { if (r == rank) continue; // Skip our own data - we already have owned element connectivity // Parse connectivity data for rank r @@ -1545,7 +1540,7 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); // Sequential rank-wise printing of extended mesh structure info if(print_info) { - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { MPI_Barrier(MPI_COMM_WORLD); if (rank == r) { std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; @@ -1636,6 +1631,13 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); + double t_ghost_end = MPI_Wtime(); + + if (rank == 0) { + std::cout << " Finished calculating ghost elements" << std::endl; + std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." << std::endl; + } + final_mesh.nodes_in_elem.update_device(); final_mesh.build_connectivity(); @@ -1686,7 +1688,7 @@ void partition_mesh( // b) Displacements and total std::vector owned_displs(world_size,0); int total_owned = 0; - for (int r=0; r owned_coords_send(3*local_owned_count, 0.0); - for (int i=0; i coord_counts(world_size); std::vector coord_displs(world_size); - for (int r=0; r coord[3] std::unordered_map> gid_to_coord; - for (int i=0; i xyz = { all_owned_coords[3*i+0], all_owned_coords[3*i+1], @@ -1749,7 +1751,7 @@ void partition_mesh( // -------------------------------------------------------------------------------------- -// Build the send patterns for elements + // Build the send patterns for elements // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. // Steps: // 1) Each rank contributes its ghost element GIDs. 
@@ -1773,7 +1775,7 @@ void partition_mesh( // Displacements and recv buffer std::vector ghost_displs(world_size, 0); int total_ghosts = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { ghost_displs[r] = total_ghosts; total_ghosts += ghost_counts[r]; } @@ -1788,7 +1790,7 @@ void partition_mesh( // Build map gid -> ranks that ghost it std::unordered_map> gid_to_ghosting_ranks; gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { int cnt = ghost_counts[r]; int off = ghost_displs[r]; for (int i = 0; i < cnt; i++) { @@ -2057,7 +2059,7 @@ void partition_mesh( // // Displacements and recv buffer // std::vector ghost_node_displs(world_size, 0); // int total_ghost_nodes = 0; - // for (int r = 0; r < world_size; ++r) { + // for (int r = 0; r < world_size; r++) { // ghost_node_displs[r] = total_ghost_nodes; // total_ghost_nodes += ghost_node_counts[r]; // } @@ -2078,7 +2080,7 @@ void partition_mesh( // // Build map node_gid -> ranks that ghost it // std::unordered_map> node_gid_to_ghosting_ranks; // node_gid_to_ghosting_ranks.reserve(static_cast(total_ghost_nodes)); - // for (int r = 0; r < world_size; ++r) { + // for (int r = 0; r < world_size; r++) { // int cnt = ghost_node_counts[r]; // int off = ghost_node_displs[r]; // for (int i = 0; i < cnt; i++) { @@ -2136,9 +2138,6 @@ void partition_mesh( // MPI_Barrier(MPI_COMM_WORLD); // if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; - - - } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 7de5d847..88727e2e 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -58,6 +58,10 @@ int main(int argc, char** argv) { std::cout<<"Initializing mesh"< #include #include +#include +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn split +/// +/// 
\brief Splits a string by a given delimiter +/// +/// \param Input string +/// \param delimiter +/// +/// \return Vector of split string values +/// +///////////////////////////////////////////////////////////////////////////// +inline std::vector split(std::string s, std::string delimiter) +{ + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + std::string token; + std::vector res; + + while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; +} // end of split ///////////////////////////////////////////////////////////////////////////// /// @@ -777,4 +804,231 @@ void write_vtu(Mesh_t& mesh, } // end write_vtu + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_vtk_mesh + /// + /// \brief Read ASCII .vtk mesh file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Node state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_vtk_mesh(Mesh_t& mesh, + node_t& node, + int num_dims, + std::string mesh_file_) +{ + + std::cout<<"Reading VTK mesh"< v = split (str, delimiter); + + // looking for the following text: + // POINTS %d float + if(v[0] == "POINTS"){ + size_t num_nodes = std::stoi(v[1]); + printf("Number of nodes read in %zu\n", num_nodes); + mesh.initialize_nodes(num_nodes); + + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dims, required_node_state); + + found=true; + } // end if + + + if (i>1000){ + std::cerr << "ERROR: Failed to find POINTS in file" << std::endl; + break; + } // end if + + i++; + } // end while + + // read the node coordinates + for (node_gid=0; node_gid v = split (str, delimiter); + + // save the nodal coordinates + node.coords.host(node_gid, 
0) = std::stod(v[0]); // double + node.coords.host(node_gid, 1) = std::stod(v[1]); // double + if(num_dims==3){ + node.coords.host(node_gid, 2) = std::stod(v[2]); // double + } + + } // end for nodes + + + // Update device nodal positions + node.coords.update_device(); + + + found=false; + + // look for CELLS + i = 0; + size_t num_elem = 0; + while (found==false) { + std::string str; + std::getline(in, str); + + std::string delimiter = " "; + std::vector v = split (str, delimiter); + std::cout << v[0] << std::endl; // printing + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELLS"){ + num_elem = std::stoi(v[1]); + printf("Number of elements read in %zu\n", num_elem); + + // initialize elem variables + mesh.initialize_elems(num_elem, num_dims); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find CELLS \n"); + break; + } // end if + + i++; + } // end while + + + // read the node ids in the element + for (elem_gid=0; elem_gid v = split (str, delimiter); + num_nodes_in_elem = std::stoi(v[0]); + + for (size_t node_lid=0; node_lid v = split (str, delimiter); + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELL_TYPES"){ + + std::getline(in, str); + elem_type = std::stoi(str); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find elem_TYPE \n"); + break; + } // end if + + i++; + } // end while + printf("Element type = %zu \n", elem_type); + // elem types: + // linear hex = 12, linear quad = 9 + found=false; + + + if(num_nodes_in_elem==8 & elem_type != 12) { + printf("Wrong element type of %zu \n", elem_type); + std::cerr << "ERROR: incorrect element type in VTK file" << std::endl; + } + + in.close(); + +} // end of VTKread function + #endif \ No newline at end of file From 4671064a27a0f01c4f4849be2df78667fa339c7d Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 10 Nov 2025 16:19:45 -0600 Subject: [PATCH 28/52] BUILD: Getting everything building with new 
examples --- examples/CMakeLists.txt | 157 +++++----- examples/mesh_decomp/CMakeLists.txt | 1 + examples/mesh_decomp/communication_plan.h | 292 +++++++++++------- examples/mesh_decomp/decomp_utils.h | 61 +++- examples/mesh_decomp/mpi_type.h | 50 +-- ...cation_plan.h => communication_plan_old.h} | 0 src/include/mapped_mpi_types.h | 2 +- 7 files changed, 324 insertions(+), 239 deletions(-) rename src/include/{communication_plan.h => communication_plan_old.h} (100%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e32ddb2d..4c379334 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -88,117 +88,118 @@ if (KOKKOS) add_definitions(-DHAVE_THREADS=1) endif() - # add_executable(testsetval test_set_values.cpp) - # target_link_libraries(testsetval ${LINKING_LIBRARIES}) + add_executable(testsetval test_set_values.cpp) + target_link_libraries(testsetval ${LINKING_LIBRARIES}) - # add_executable(mtestkokkos main_kokkos.cpp) - # target_link_libraries(mtestkokkos ${LINKING_LIBRARIES}) + add_executable(mtestkokkos main_kokkos.cpp) + target_link_libraries(mtestkokkos ${LINKING_LIBRARIES}) - # add_executable(drrak_test test_drrak.cpp) - # target_link_libraries(drrak_test ${LINKING_LIBRARIES}) + add_executable(drrak_test test_drrak.cpp) + target_link_libraries(drrak_test ${LINKING_LIBRARIES}) - # add_executable(test_kokkos_for kokkos_for.cpp) - # target_link_libraries(test_kokkos_for ${LINKING_LIBRARIES}) + add_executable(test_kokkos_for kokkos_for.cpp) + target_link_libraries(test_kokkos_for ${LINKING_LIBRARIES}) - # add_executable(test_dual_types test_dual_types.cpp) - # target_link_libraries(test_dual_types ${LINKING_LIBRARIES}) + add_executable(test_dual_types test_dual_types.cpp) + target_link_libraries(test_dual_types ${LINKING_LIBRARIES}) - # add_executable(kokkos_csr CSRKokkos.cpp) - # target_link_libraries(kokkos_csr ${LINKING_LIBRARIES}) + add_executable(kokkos_csr CSRKokkos.cpp) + target_link_libraries(kokkos_csr ${LINKING_LIBRARIES}) 
- # add_executable(kokkos_csc CSCKokkos.cpp) - # target_link_libraries(kokkos_csc ${LINKING_LIBRARIES}) + add_executable(kokkos_csc CSCKokkos.cpp) + target_link_libraries(kokkos_csc ${LINKING_LIBRARIES}) - # add_executable(mtr_kokkos-simple mtr-kokkos-simple.cpp) - # target_link_libraries(mtr_kokkos-simple ${LINKING_LIBRARIES}) + add_executable(mtr_kokkos-simple mtr-kokkos-simple.cpp) + target_link_libraries(mtr_kokkos-simple ${LINKING_LIBRARIES}) - # add_executable(annkokkos ann_kokkos.cpp) - # target_link_libraries(annkokkos ${LINKING_LIBRARIES}) + add_executable(annkokkos ann_kokkos.cpp) + target_link_libraries(annkokkos ${LINKING_LIBRARIES}) - # add_executable(annkokkos_compare ann_kokkos_compare.cpp) - # target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES}) + add_executable(annkokkos_compare ann_kokkos_compare.cpp) + target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES}) - # #add_executable(ompperftest ompperftest.cpp) - # #target_link_libraries(ompperftest ${LINKING_LIBRARIES}) + #add_executable(ompperftest ompperftest.cpp) + #target_link_libraries(ompperftest ${LINKING_LIBRARIES}) - # add_executable(lu_test test_lu_solve.cpp) - # target_link_libraries(lu_test ${LINKING_LIBRARIES}) + add_executable(lu_test test_lu_solve.cpp) + target_link_libraries(lu_test ${LINKING_LIBRARIES}) - # add_executable(qr_test test_qr_solve.cpp) - # target_link_libraries(qr_test ${LINKING_LIBRARIES}) + add_executable(qr_test test_qr_solve.cpp) + target_link_libraries(qr_test ${LINKING_LIBRARIES}) - # if (Matar_ENABLE_TRILINOS) - # add_executable(anndistributed ann_distributed.cpp) - # target_link_libraries(anndistributed ${LINKING_LIBRARIES}) + if (Matar_ENABLE_TRILINOS) + add_executable(anndistributed ann_distributed.cpp) + target_link_libraries(anndistributed ${LINKING_LIBRARIES}) - # add_executable(anndistributed_crs ann_distributed_crs.cpp) - # target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES}) + add_executable(anndistributed_crs 
ann_distributed_crs.cpp) + target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES}) + + add_executable(test_tpetra_farray test_tpetra_farray.cpp) + target_link_libraries(test_tpetra_farray ${LINKING_LIBRARIES}) - # add_executable(test_tpetra_farray test_tpetra_farray.cpp) - # target_link_libraries(test_tpetra_farray ${LINKING_LIBRARIES}) + add_executable(test_tpetra_carray test_tpetra_carray.cpp) + target_link_libraries(test_tpetra_carray ${LINKING_LIBRARIES}) - # add_executable(test_tpetra_carray test_tpetra_carray.cpp) - # target_link_libraries(test_tpetra_carray ${LINKING_LIBRARIES}) + add_executable(test_tpetra_crs test_tpetra_crs.cpp) + target_link_libraries(test_tpetra_crs ${LINKING_LIBRARIES}) - # add_executable(test_tpetra_crs test_tpetra_crs.cpp) - # target_link_libraries(test_tpetra_crs ${LINKING_LIBRARIES}) + add_executable(test_tpetra_mesh test_tpetra_mesh.cpp) + target_link_libraries(test_tpetra_mesh ${LINKING_LIBRARIES}) + endif() - # add_executable(test_tpetra_mesh test_tpetra_mesh.cpp) - # target_link_libraries(test_tpetra_mesh ${LINKING_LIBRARIES}) - # endif() + if (OPENMP) + add_executable(parallel_hello_world parallel_hello_world.cpp) + target_link_libraries(parallel_hello_world ${LINKING_LIBRARIES}) + endif() - # if (OPENMP) - # add_executable(parallel_hello_world parallel_hello_world.cpp) - # target_link_libraries(parallel_hello_world ${LINKING_LIBRARIES}) - # endif() + if (MPI) + include_directories(laplaceMPI) + add_subdirectory(laplaceMPI) - # if (MPI) - # include_directories(laplaceMPI) - # add_subdirectory(laplaceMPI) - # endif() + include_directories(mesh_decomp) + add_subdirectory(mesh_decomp) + endif() endif() -# ### HIP Linking error, will add back in after fixed -# if (NOT HIP) -# include_directories(virtualFcnKokkos) -# add_subdirectory(virtualFcnKokkos) -# endif() +### HIP Linking error, will add back in after fixed +if (NOT HIP) + include_directories(virtualFcnKokkos) + add_subdirectory(virtualFcnKokkos) +endif() -# # In 
testing, not working -# #include_directories(gArrayofgArrays) -# #add_subdirectory(gArrayofgArrays) +# In testing, not working +#include_directories(gArrayofgArrays) +#add_subdirectory(gArrayofgArrays) -# include_directories(virtualFcnMATAR) -# add_subdirectory(virtualFcnMATAR) +include_directories(virtualFcnMATAR) +add_subdirectory(virtualFcnMATAR) -# include_directories(laplace) -# add_subdirectory(laplace) +include_directories(laplace) +add_subdirectory(laplace) -# include_directories(halfspace_cooling) -# add_subdirectory(halfspace_cooling) +include_directories(halfspace_cooling) +add_subdirectory(halfspace_cooling) -# include_directories(watt-graph) -# add_subdirectory(watt-graph) +include_directories(watt-graph) +add_subdirectory(watt-graph) -# #include_directories(matar_fortran) -# #add_subdirectory(matar_fortran) +#include_directories(matar_fortran) +#add_subdirectory(matar_fortran) -# include_directories(sparsetests) -# add_subdirectory(sparsetests) +include_directories(sparsetests) +add_subdirectory(sparsetests) -# include_directories(test_rocm) -# add_subdirectory(test_rocm) +include_directories(test_rocm) +add_subdirectory(test_rocm) -include_directories(mesh_decomp) -add_subdirectory(mesh_decomp) -#include_directories(phaseField/srcKokkosVerbose) -#add_subdirectory(phaseField/srcKokkosVerbose) +# include_directories(phaseField/srcKokkosVerbose) +# add_subdirectory(phaseField/srcKokkosVerbose) -#include_directories(phaseField/srcMacros) -#add_subdirectory(phaseField/srcMacros) +# include_directories(phaseField/srcMacros) +# add_subdirectory(phaseField/srcMacros) -#include_directories(phaseFieldMPI) -#add_subdirectory(phaseFieldMPI) +# include_directories(phaseFieldMPI) +# add_subdirectory(phaseFieldMPI) diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt index 7b7306cd..b5ea83ca 100644 --- a/examples/mesh_decomp/CMakeLists.txt +++ b/examples/mesh_decomp/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 
3.1.3) # Find MPI find_package(MPI REQUIRED) +add_definitions(-DHAVE_MPI=1) find_package(Matar REQUIRED) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index eabba8da..2023f609 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -1,6 +1,8 @@ #ifndef COMMUNICATION_PLAN_H #define COMMUNICATION_PLAN_H +#ifdef HAVE_MPI +#include #include "matar.h" using namespace mtr; @@ -58,7 +60,6 @@ using namespace mtr; // This is a good optimization for large meshes, but will require maps from MPI_comm_world rank IDs to the new reordered rank IDs. int reorder = 0; - DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_per_rank] Indices of items to send to each rank DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_per_rank] Indices of items to receive from each rank @@ -69,12 +70,9 @@ using namespace mtr; DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank - int total_send_count; - int total_recv_count; + int total_send_count; // Total number of items to send + int total_recv_count; // Total number of items to receive - - - // ======================================================================== // CONSTRUCTOR / INITIALIZATION // ======================================================================== @@ -99,148 +97,218 @@ using namespace mtr; MPI_Comm_size(comm_world, &world_size); } + /** + * @brief Initialize an MPI distributed graph communicator for sparse neighbor communication. + * + * This function creates an MPI "dist graph communicator" tailored to the sparse data exchange + * patterns typical in mesh-based parallel applications. It establishes direct knowledge for MPI + * about which processes (ranks) each process will communicate with. 
This improves the efficiency + * and clarity of later communication (for example, with MPI_Neighbor_alltoallv). + * + * This function is especially useful when the communication pattern is not all-to-all, but rather + * a sparse subset: for instance, where each process only exchanges data with a few neighbors. + * + * ==== Key Concepts ==== + * - MPI Communicator: An MPI object representing a group of processes that can communicate with each other. + * For context, "MPI_COMM_WORLD" is a communicator including all processes, but a graph communicator + * customizes direct process connections. + * - Rank: Integer ID identifying a process in a communicator. + * - Distributed Graph: MPI can represent communication as a directed sparse graph, with edges from + * this rank to those it needs to send to, and from those it will receive from. + * + * ==== Parameters ==== + * @param num_send_ranks [in] Number of ranks this process will send data to (out-neighbors). + * @param send_rank_ids [in] Array of size num_send_ranks; each entry is the rank of a process to send to. + * @param num_recv_ranks [in] Number of ranks this process will receive data from (in-neighbors). + * @param recv_rank_ids [in] Array of size num_recv_ranks; each entry is the rank of a process to receive from. + * + * ==== Steps ==== + * + * 1. Checks if the basic communicator has been initialized. + * Throws an error if it has not. + * + * 2. Stores the send/receive neighbor counts and rank lists internally. + * Copies the IDs into the internal device-host arrays. + * - send_rank_ids: process IDs that will be destinations for outgoing messages. + * - recv_rank_ids: process IDs that will provide incoming messages. + * + * 3. Calls MPI_Dist_graph_create_adjacent: + * This constructs a new MPI communicator ("mpi_comm_graph") that encodes this process's + * inbound and outbound neighbors. MPI uses this to optimize and route messages directly + * and efficiently during later neighbor collectives. 
+ * + * - Note: The 'recv_weights' and 'send_weights' arguments are set to NULL here; + * this means we are not giving extra weighting or priorities to any connection. + * - The 'reorder' argument (set to 0 in this class) disables rank reordering; + * this ensures the assignment of process ranks is preserved, which is often needed + * for mapping data or results back to physical entities. + * - On return, 'mpi_comm_graph' will allow use of "neighbor" collectives (MPI_Neighbor_alltoall[v], etc.), + * which automatically use the provided topology to send/receive to only neighbors efficiently. + * + * 4. Marks the internal flag indicating that the graph communicator has been set up ("has_comm_graph"). + * + * ==== Example Usage ==== + * Suppose rank 0 will send to ranks 1 and 2, and receive from rank 3 only: + * int send_ranks[2] = {1, 2}; + * int recv_ranks[1] = {3}; + * initialize_graph_communicator(2, send_ranks, 1, recv_ranks); + * + * ==== Why Use This? ==== + * - This avoids the need to do manual pairwise MPI_Send/MPI_Recv in your code, + * and enables the use of neighbor collectives -- concise, scalable, and hard-to-get-wrong. + * - It explicitly tells MPI only about your neighbors, so it can optimize routes and memory. + * - If you have a large number of processes or a mesh/network with only local coupling, + * this approach scales much better than using global/all-to-all communication. + * + * @throws std::runtime_error if the base communicator has not been initialized. + */ void initialize_graph_communicator(int num_send_ranks, int* send_rank_ids, int num_recv_ranks, int* recv_rank_ids){ + // Check if the MPI_COMM_WORLD communicator has been initialized. 
if(!has_comm_world){ throw std::runtime_error("MPI communicator for the world has not been initialized"); } + // Store the number of outbound and inbound neighbors this->num_send_ranks = num_send_ranks; this->num_recv_ranks = num_recv_ranks; + // Copy and store send neighbor IDs (out-bound neighbors: where we will send data to) this->send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); for(int i = 0; i < num_send_ranks; i++){ this->send_rank_ids(i) = send_rank_ids[i]; } - + // Copy and store receive neighbor IDs (in-bound neighbors: where we will receive data from) this->recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); for(int i = 0; i < num_recv_ranks; i++){ this->recv_rank_ids(i) = recv_rank_ids[i]; } + // Create the distributed graph communicator. + // This call links this process to its explicit send and receive neighbors. + // See https://www.open-mpi.org/doc/v4.0/man3/MPI_Dist_graph_create_adjacent.3.php for more details. MPI_Dist_graph_create_adjacent( - mpi_comm_world, - num_recv_ranks, - this->recv_rank_ids.host_pointer(), - recv_weights, - num_send_ranks, - this->send_rank_ids.host_pointer(), - send_weights, - info, - reorder, - &mpi_comm_graph + mpi_comm_world, // Existing communicator (usually MPI_COMM_WORLD) + num_recv_ranks, // Number of in-neighbors (recv) + this->recv_rank_ids.host_pointer(), // Array of in-neighbor ranks (who we receive from) + recv_weights, // Edge weights (NULL = unweighted) + num_send_ranks, // Number of out-neighbors (send) + this->send_rank_ids.host_pointer(), // Array of out-neighbor ranks (who we send to) + send_weights, // Edge weights (NULL = unweighted) + info, // Additional info for MPI (not used, set to MPI_INFO_NULL) + reorder, // Allow MPI to reorder ranks for performance (0 disables) + &mpi_comm_graph // [out] New graph communicator ); + // Set the internal flag indicating that we have created the MPI distributed graph communicator. 
has_comm_graph = true; } - void verify_graph_communicator(){ - if(!has_comm_graph){ - throw std::runtime_error("MPI graph communicator has not been initialized"); - } - - // ============================================================================ - // Verify the distributed graph communicator - // ============================================================================ - // Query the graph to verify it matches what we specified - int indegree_out, outdegree_out, weighted; - MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); + // void verify_graph_communicator(){ + // if(!has_comm_graph){ + // throw std::runtime_error("MPI graph communicator has not been initialized"); + // } + + // // ============================================================================ + // // Verify the distributed graph communicator + // // ============================================================================ + // // Query the graph to verify it matches what we specified + // int indegree_out, outdegree_out, weighted; + // MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); - // Allocate arrays to receive neighbor information - std::vector sources_out(indegree_out); - std::vector sourceweights_out(indegree_out); - std::vector destinations_out(outdegree_out); - std::vector destweights_out(outdegree_out); + // // Allocate arrays to receive neighbor information + // std::vector sources_out(indegree_out); + // std::vector sourceweights_out(indegree_out); + // std::vector destinations_out(outdegree_out); + // std::vector destweights_out(outdegree_out); - // Retrieve the actual neighbors from the graph communicator - MPI_Dist_graph_neighbors(mpi_comm_graph, - indegree_out, sources_out.data(), sourceweights_out.data(), - outdegree_out, destinations_out.data(), destweights_out.data()); + // // Retrieve the actual neighbors from the graph communicator + // MPI_Dist_graph_neighbors(mpi_comm_graph, + // 
indegree_out, sources_out.data(), sourceweights_out.data(), + // outdegree_out, destinations_out.data(), destweights_out.data()); - int rank = -1; - MPI_Comm_rank(mpi_comm_world, &rank); + // int rank = -1; + // MPI_Comm_rank(mpi_comm_world, &rank); - // Additional verification: Check if the queried values match our input - bool verification_passed = true; + // // Additional verification: Check if the queried values match our input + // bool verification_passed = true; - // Print verification information for each rank sequentially - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(mpi_comm_world); - if (rank == r) { - std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; - std::cout << " Indegree (receives from " << indegree_out << " ranks): "; - for (int i = 0; i < indegree_out; ++i) { - std::cout << sources_out[i] << " "; - } - std::cout << std::endl; + // // Print verification information for each rank sequentially + // for (int r = 0; r < world_size; ++r) { + // MPI_Barrier(mpi_comm_world); + // if (rank == r) { + // std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; + // std::cout << " Indegree (receives from " << indegree_out << " ranks): "; + // for (int i = 0; i < indegree_out; ++i) { + // std::cout << sources_out[i] << " "; + // } + // std::cout << std::endl; - std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; - for (int i = 0; i < outdegree_out; ++i) { - std::cout << destinations_out[i] << " "; - } - std::cout << std::endl; + // std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; + // for (int i = 0; i < outdegree_out; ++i) { + // std::cout << destinations_out[i] << " "; + // } + // std::cout << std::endl; - std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; - } - MPI_Barrier(mpi_comm_world); - } + // std::cout << " Weighted: " << (weighted ? 
"yes" : "no") << std::endl; + // } + // MPI_Barrier(mpi_comm_world); + // } - // Check if the counts match our stored values - if (indegree_out != num_recv_ranks) { - std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " - << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; - verification_passed = false; - } - if (outdegree_out != num_send_ranks) { - std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " - << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; - verification_passed = false; - } + // // Check if the counts match our stored values + // if (indegree_out != num_recv_ranks) { + // std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " + // << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; + // verification_passed = false; + // } + // if (outdegree_out != num_send_ranks) { + // std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " + // << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; + // verification_passed = false; + // } - // Check if source ranks match (build set from our stored recv_rank_ids) - std::set sources_set_in; - for (int i = 0; i < num_recv_ranks; ++i) { - sources_set_in.insert(recv_rank_ids.host(i)); - } - std::set sources_set_out(sources_out.begin(), sources_out.end()); - if (sources_set_in != sources_set_out) { - std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" << std::endl; - verification_passed = false; - } + // // Check if source ranks match (build set from our stored recv_rank_ids) + // std::set sources_set_in; + // for (int i = 0; i < num_recv_ranks; ++i) { + // sources_set_in.insert(recv_rank_ids.host(i)); + // } + // std::set sources_set_out(sources_out.begin(), sources_out.end()); + // if (sources_set_in != sources_set_out) { + // std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" 
<< std::endl; + // verification_passed = false; + // } - // Check if destination ranks match (build set from our stored send_rank_ids) - std::set dests_set_in; - for (int i = 0; i < num_send_ranks; ++i) { - dests_set_in.insert(send_rank_ids.host(i)); - } - std::set dests_set_out(destinations_out.begin(), destinations_out.end()); - if (dests_set_in != dests_set_out) { - std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; - verification_passed = false; - } + // // Check if destination ranks match (build set from our stored send_rank_ids) + // std::set dests_set_in; + // for (int i = 0; i < num_send_ranks; ++i) { + // dests_set_in.insert(send_rank_ids.host(i)); + // } + // std::set dests_set_out(destinations_out.begin(), destinations_out.end()); + // if (dests_set_in != dests_set_out) { + // std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; + // verification_passed = false; + // } - // Global verification check - int local_passed = verification_passed ? 1 : 0; - int global_passed = 0; - MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); - MPI_Barrier(mpi_comm_world); - if (rank == 0) { - if (global_passed) { - std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; - } else { - std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; - } - } - MPI_Barrier(mpi_comm_world); - } - + // // Global verification check + // int local_passed = verification_passed ? 
1 : 0; + // int global_passed = 0; + // MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); + // MPI_Barrier(mpi_comm_world); + // if (rank == 0) { + // if (global_passed) { + // std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; + // } else { + // std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; + // } + // } + // MPI_Barrier(mpi_comm_world); + // } void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ - this->send_indices_ = rank_send_ids; // ods of element data to send to each rank - this->recv_indices_ = rank_recv_ids; // - + this->send_indices_ = rank_send_ids; // indices of element data to send to each rank + this->recv_indices_ = rank_recv_ids; // indices of element data to receive from each rank // Setup send data this->send_counts_ = DCArrayKokkos(num_send_ranks, "send_counts"); @@ -280,9 +348,9 @@ using namespace mtr; MPI_Barrier(mpi_comm_world); } +}; // End of CommunicationPlan -}; - -#endif // COMMUNICATION_PLAN_H +#endif // end if HAVE_MPI +#endif // end if COMMUNICATION_PLAN_H diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index dada0d99..f0e7ae4d 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -21,6 +21,29 @@ #include "scotch.h" #include "ptscotch.h" +/** + * @brief Partitions the input mesh into a naive element-based decomposition across MPI ranks. + * + * This function splits the input mesh (and its associated node information) evenly among the given number of MPI ranks. + * It assigns contiguous blocks of elements (and the corresponding nodes and nodal data) to each rank. + * + * The function constructs: + * - The sub-mesh (naive_mesh) and its nodes (naive_node) for the local rank. + * - Maps and vectors indicating elements and nodes present on each rank. 
+ * - Auxiliary arrays (elems_in_elem_on_rank, num_elems_in_elem_per_rank) for local element connectivity and neighbor look-ups. + * + * The decomposition is "naive" in that it uses a simple contiguous block assignment, without regard to mesh topology or quality of partitioning. + * This function is generally used as the preliminary step before repartitioning with tools like PT-Scotch or for algorithm prototyping. + * + * @param initial_mesh[in] The input mesh containing all elements/nodes on rank 0. + * @param initial_node[in] The nodal data for the input mesh on rank 0. + * @param naive_mesh[out] The mesh on this rank after naive partitioning. + * @param naive_node[out] The nodal data on this rank after naive partitioning. + * @param elems_in_elem_on_rank[out] Vector of element-to-element connectivity for this rank's local mesh. + * @param num_elems_in_elem_per_rank[out] Vector of counts for element neighbors for each local element. + * @param world_size[in] Number of MPI ranks (world size). + * @param rank[in] This MPI rank's id. + */ void naive_partition_mesh( Mesh_t& initial_mesh, @@ -530,6 +553,32 @@ void naive_partition_mesh( } +/** + * @brief Partitions the input mesh using PT-Scotch and constructs the final distributed mesh. + * + * This function performs parallel mesh partitioning using a two-stage approach: + * 1. A naive partition is first constructed (simple assignment of mesh elements/nodes across ranks). + * 2. PT-Scotch is then used to repartition the mesh for load balancing and improved connectivity. + * + * The partitioned mesh, nodal data, and associated connectivity/gauss point information + * are distributed among MPI ranks as a result. The procedure ensures that each rank receives + * its assigned portion of the mesh and associated data in the final (target) decomposition. + * + * @param initial_mesh[in] The input (global) mesh, present on rank 0 or all ranks at start. 
+ * @param final_mesh[out] The mesh assigned to this rank after PT-Scotch decomposition. + * @param initial_node[in] Nodal data for the input (global) mesh; must match initial_mesh. + * @param final_node[out] Nodal data for this rank after decomposition (corresponds to final_mesh). + * @param gauss_point[out] Gauss point data structure, filled out for this rank's mesh. + * @param world_size[in] Number of MPI ranks in use (the total number of partitions). + * @param rank[in] This process's MPI rank ID. + * + * Internals: + * - The routine uses a naive_partition_mesh() helper to create an initial contiguous mesh partition. + * - It then uses PT-Scotch distributed graph routines to compute an improved partition and create the final mesh layout. + * - Both element-to-element and node-to-element connectivity, as well as mapping and ghosting information, + * are managed and exchanged across ranks. + * - MPI routines synchronize and exchange the relevant mesh and nodal data following the computed partition. 
+ */ void partition_mesh( Mesh_t& initial_mesh, @@ -557,13 +606,9 @@ void partition_mesh( std::vector elems_in_elem_on_rank; std::vector num_elems_in_elem_per_rank; - naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); - - -// ****************************************************************************************** -// Compute a repartition of the mesh using pt-scotch -// ****************************************************************************************** + // Perform the naive partitioning of the mesh + naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); /********************************************************************************** @@ -1906,7 +1951,7 @@ void partition_mesh( element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); MPI_Barrier(MPI_COMM_WORLD); // Optional: Verify the graph communicator was created successfully - if(print_info) element_communication_plan.verify_graph_communicator(); + // if(print_info) element_communication_plan.verify_graph_communicator(); // ****************************************************************************************** @@ -2036,7 +2081,7 @@ void partition_mesh( // -------------------------------------------------------------------------------------- -// Build the send pattern for nodes + // TODO: Build the send pattern for nodes -------------------------------------------------------------------------------------- // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. // Steps: // 1) Each rank contributes its ghost node GIDs. 
diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 98f62313..be7984d5 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -1,12 +1,13 @@ #ifndef MPICARRAYKOKKOS_H #define MPICARRAYKOKKOS_H +// #ifdef HAVE_MPI +#include #include "matar.h" #include "communication_plan.h" -using namespace mtr; - -// Add this before the MPICArrayKokkos class definition +namespace mtr +{ // Type trait to map C++ types to MPI_Datatype template @@ -70,8 +71,7 @@ struct mpi_type_map { ///////////////////////// -// MPICArrayKokkos: Dual type for managing distributed data on both CPU and GPU. -// +// MPICArrayKokkos: Type for managing distributed data on both CPU and GPU. ///////////////////////// template class MPICArrayKokkos { @@ -322,40 +322,6 @@ class MPICArrayKokkos { this_array_.set_values(value); }; - - void reduce_sum(T& result){}; - - - // // MPI send wrapper - // void send(size_t count, int dest, int tag, MPI_Comm comm); - - // // MPI recieve wrapper - // void recv(size_t count, int dest, int tag, MPI_Comm comm); - - // // MPI broadcast wrapper - // void broadcast(size_t count, int root, MPI_Comm comm); - - // // MPI scatter wrapper - // void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // // MPI gather wrapper - // void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // // MPI allgather wrapper - // void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); - - // // MPI send wrapper - // void isend(size_t count, int dest, int tag, MPI_Comm comm); - - // // MPI recieve wrapper - // void irecv(size_t count, int dest, int tag, MPI_Comm comm); - - // // MPI wait wrapper for sender - // void wait_send(); - - // // MPI wait wrapper for receiver - // void wait_recv(); - // Deconstructor virtual KOKKOS_INLINE_FUNCTION ~MPICArrayKokkos (); @@ -606,4 +572,8 @@ 
MPICArrayKokkos::~MPICArrayKokkos() { } -#endif \ No newline at end of file +} // end namespace mtr + + +// #endif // end if have MPI +#endif // end if MPICARRAYKOKKOS_H \ No newline at end of file diff --git a/src/include/communication_plan.h b/src/include/communication_plan_old.h similarity index 100% rename from src/include/communication_plan.h rename to src/include/communication_plan_old.h diff --git a/src/include/mapped_mpi_types.h b/src/include/mapped_mpi_types.h index 6d5d18d3..3c0ca4d0 100644 --- a/src/include/mapped_mpi_types.h +++ b/src/include/mapped_mpi_types.h @@ -45,7 +45,7 @@ #include #include #include "partition_map.h" -#include "communication_plan.h" +// #include "communication_plan.h" namespace mtr { From 70a30cfe184340106c15af1b2f3b6ab965a5c8b9 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 10 Nov 2025 16:31:18 -0600 Subject: [PATCH 29/52] ENH: Working through build with examples --- examples/mesh_decomp/state.h | 2 +- .../include}/communication_plan.h | 0 src/include/mapped_mpi_types.h | 1 - src/include/mpi_types.h | 897 +++++++----------- src/include/mpi_types_old.h | 784 +++++++++++++++ 5 files changed, 1131 insertions(+), 553 deletions(-) rename {examples/mesh_decomp => src/include}/communication_plan.h (100%) create mode 100644 src/include/mpi_types_old.h diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 2ed970d5..0da00095 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -35,7 +35,7 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define STATE_H #include "matar.h" -#include "mpi_type.h" +// #include "mpi_type.h" using namespace mtr; diff --git a/examples/mesh_decomp/communication_plan.h b/src/include/communication_plan.h similarity index 100% rename from examples/mesh_decomp/communication_plan.h rename to src/include/communication_plan.h diff --git a/src/include/mapped_mpi_types.h b/src/include/mapped_mpi_types.h index 3c0ca4d0..ed690ca6 100644 --- a/src/include/mapped_mpi_types.h +++ b/src/include/mapped_mpi_types.h @@ -45,7 +45,6 @@ #include #include #include "partition_map.h" -// #include "communication_plan.h" namespace mtr { diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index b10a57fc..ac651551 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -1,120 +1,149 @@ -#ifndef MPI_TYPES_H -#define MPI_TYPES_H -/********************************************************************************************** - © 2020. Triad National Security, LLC. All rights reserved. - This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos - National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. - Department of Energy/National Nuclear Security Administration. All rights in the program are - reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear - Security Administration. The Government is granted for itself and others acting on its behalf a - nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare - derivative works, distribute copies to the public, perform publicly and display publicly, and - to permit others to do so. - This program is open source under the BSD-3 License. - Redistribution and use in source and binary forms, with or without modification, are permitted - provided that the following conditions are met: - - 1. 
Redistributions of source code must retain the above copyright notice, this list of - conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, this list of - conditions and the following disclaimer in the documentation and/or other materials - provided with the distribution. - - 3. Neither the name of the copyright holder nor the names of its contributors may be used - to endorse or promote products derived from this software without specific prior - written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- **********************************************************************************************/ - -#include "host_types.h" -#include "kokkos_types.h" -#include +#ifndef MPICARRAYKOKKOS_H +#define MPICARRAYKOKKOS_H + #ifdef HAVE_MPI #include +#include "matar.h" +#include "communication_plan.h" namespace mtr { +// Type trait to map C++ types to MPI_Datatype +template +struct mpi_type_map { + static MPI_Datatype value() { + static_assert(sizeof(T) == 0, "Unsupported type for MPI communication"); + return MPI_DATATYPE_NULL; + } +}; + +// Specializations for common types +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_INT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_FLOAT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_DOUBLE; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_CHAR; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_CHAR; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_C_BOOL; } +}; + + ///////////////////////// -// MPIArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// MPICArrayKokkos: Type for managing distributed data on both CPU and GPU. 
///////////////////////// template -class MPIArrayKokkos { +class MPICArrayKokkos { - // this is manage - using TArray1D = Kokkos::DualView ; + // Dual view for managing data on both CPU and GPU + DCArrayKokkos this_array_; + + DCArrayKokkos send_buffer_; + DCArrayKokkos recv_buffer_; protected: size_t dims_[7]; size_t length_; size_t order_; // tensor order (rank) - int mpi_recv_rank_; - int mpi_tag_; + MPI_Comm mpi_comm_; MPI_Status mpi_status_; MPI_Datatype mpi_datatype_; MPI_Request mpi_request_; - TArray1D this_array_; + - void set_mpi_type(); + // --- Ghost Communication Support --- + CommunicationPlan* comm_plan_; // Pointer to shared communication plan + + + DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank + DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank + DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank + DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank + + size_t stride_; // [size: num_dims] Number of contiguous values per first index element + + + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_by_rank] Indices of items to send to each rank + DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank + + + size_t num_owned_; // Number of owned items (nodes/elements) + size_t num_ghost_; // Number of ghost items (nodes/elements) public: // Data member to access host view ViewCArray host; - MPIArrayKokkos(); + + // Note, consider this for sending blocks without dealing with stride_ + // MPI_Datatype vector_type; + // MPI_Type_contiguous(stride_, mpi_type_map::value(), &vector_type); + // MPI_Type_commit(&vector_type); + + MPICArrayKokkos(); - MPIArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, 
const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); - // These functions can setup the data needed for halo send/receives - // Not necessary for standard MPI comms - void mpi_setup(); - - void mpi_setup(int recv_rank); - - void mpi_setup(int recv_rank, int tag); - - void mpi_setup(int recv_rank, int tag, MPI_Comm comm); - - void mpi_set_rank(int recv_rank); - - void mpi_set_tag(int tag); - - void mpi_set_comm(MPI_Comm comm); - - int get_rank(); - - int get_tag(); - MPI_Comm get_comm(); KOKKOS_INLINE_FUNCTION T& operator()(size_t i) const; @@ -140,7 +169,48 @@ class MPIArrayKokkos { size_t n, size_t o) const; KOKKOS_INLINE_FUNCTION - MPIArrayKokkos& operator=(const MPIArrayKokkos& temp); + MPICArrayKokkos& operator=(const MPICArrayKokkos& temp); + + + // Method to set comm plan for 
halo communication + void initialize_comm_plan(CommunicationPlan& comm_plan){ + comm_plan_ = &comm_plan; + + size_t send_size = comm_plan_->total_send_count * stride_; + size_t recv_size = comm_plan_->total_recv_count * stride_; + + if (send_size > 0) { + send_buffer_ = DCArrayKokkos(send_size, "send_buffer"); + } + if (recv_size > 0) { + recv_buffer_ = DCArrayKokkos(recv_size, "recv_buffer"); + } + + if (comm_plan_->num_send_ranks > 0) { + send_counts_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_counts"); + send_displs_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_displs"); + + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + send_counts_.host(i) = comm_plan_->send_counts_.host(i) * stride_; + send_displs_.host(i) = comm_plan_->send_displs_.host(i) * stride_; + } + send_counts_.update_device(); + send_displs_.update_device(); + } + + if (comm_plan_->num_recv_ranks > 0) { + recv_counts_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_counts"); + recv_displs_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_displs"); + + for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ + recv_counts_.host(i) = comm_plan_->recv_counts_.host(i) * stride_; + recv_displs_.host(i) = comm_plan_->recv_displs_.host(i) * stride_; + } + recv_counts_.update_device(); + recv_displs_.update_device(); + } + }; + // GPU Method // Method that returns size @@ -168,7 +238,7 @@ class MPIArrayKokkos { // Method returns kokkos dual view KOKKOS_INLINE_FUNCTION - TArray1D get_kokkos_dual_view() const; + Kokkos::DualView get_kokkos_dual_view() const; // Method that update host view void update_host(); @@ -176,167 +246,170 @@ class MPIArrayKokkos { // Method that update device view void update_device(); - // MPI send wrapper - void send(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI recieve wrapper - void recv(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI broadcast wrapper - void broadcast(size_t count, int root, MPI_Comm comm); - - // MPI scatter wrapper - 
void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // MPI gather wrapper - void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // MPI allgather wrapper - void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); - - // MPI send wrapper - void isend(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI recieve wrapper - void irecv(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI wait wrapper for sender - void wait_send(); - - // MPI wait wrapper for receiver - void wait_recv(); - - // MPI barrier wrapper - //void barrier(MPI_Comm comm); - - // MPI send wrapper - void halo_send(); - - // MPI recieve wrapper - void halo_recv(); - - // MPI send wrapper - void halo_isend(); - - // MPI recieve wrapper - void halo_irecv(); + // Method that builds the send buffer, note, this has to be ordered + // Such that all the boundary elements going to a given rank are contiguous in the send buffer. 
+ void fill_send_buffer(){ + + size_t send_idx = 0; + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ + size_t src_idx = comm_plan_->send_indices_.host(i, j); // index of the element to send + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + send_buffer_.host(send_idx + k) = this_array_.host_pointer()[src_idx * stride_ + k]; + } + send_idx += stride_; + } + } + }; + + // Method that copies the recv buffer into the this_array + void copy_recv_buffer(){ + + size_t recv_idx = 0; + for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ + for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ + size_t dest_idx = comm_plan_->recv_indices_.host(i, j); + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + this_array_.host_pointer()[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); + } + + recv_idx += stride_; + } + } + this_array_.update_device(); + }; + + + // Note: This "may" be needed, im not sure. Currently, it works.... + // Use nullptr for empty arrays to avoid accessing element 0 of 0-sized array (undefined behavior) + // T* send_buf_ptr = (send_buffer_.size() > 0) ? &send_buffer_.host(0) : nullptr; + // T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; + // int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; + // int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; + // int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; + // int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; + + // Method that communicates the data between the ranks + // NOTE: This is a blocking communication operation, + // if you want to use non-blocking communication, you can use the following: MPI_Ineighbor_alltoallv + void communicate(){ + + this_array_.update_host(); + + fill_send_buffer(); + + MPI_Neighbor_alltoallv( + send_buffer_.host_pointer(), + send_counts_.host_pointer(), + send_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE + recv_buffer_.host_pointer(), + recv_counts_.host_pointer(), + recv_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE + comm_plan_->mpi_comm_graph); + + copy_recv_buffer(); + + this_array_.update_device(); + }; + + void set_values(const T& value){ + this_array_.set_values(value); + }; // Deconstructor virtual KOKKOS_INLINE_FUNCTION - ~MPIArrayKokkos (); -}; // End of MPIArrayKokkos - + ~MPICArrayKokkos (); +}; // End of MPIDArrayKokkos // Default constructor template -MPIArrayKokkos::MPIArrayKokkos() { - length_ = order_ = 0; - for (int i = 0; i < 7; i++) { - dims_[i] = 0; +MPICArrayKokkos::MPICArrayKokkos() + : this_array_(), stride_(1), length_(0), order_(0) { + for (int i = 0; i < 7; i++) { + dims_[i] = 0; + } } -} // Overloaded 1D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) + : stride_(1), length_(dim0), order_(1) { dims_[0] = dim0; - order_ = 1; - length_ = dim0; - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0); } // Overloaded 2D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& 
tag_string) + : stride_(dim1), length_(dim0 * dim1), order_(2) { dims_[0] = dim0; dims_[1] = dim1; - order_ = 2; - length_ = (dim0 * dim1); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1); - set_mpi_type(); + + this_array_ = DCArrayKokkos(dim0, dim1, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1); } +// Overloaded 3D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) + : stride_(dim1 * dim2), length_(dim0 * dim1 * dim2), order_(3) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; - order_ = 3; - length_ = (dim0 * dim1 * dim2); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2); } +// Overloaded 4D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3), length_(dim0 * dim1 * dim2 * dim3), order_(4) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; - order_ = 4; - length_ = (dim0 * dim1 * dim2 * dim3); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); } +// Overloaded 5D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, 
size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4), length_(dim0 * dim1 * dim2 * dim3 * dim4), order_(5) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; dims_[4] = dim4; - order_ = 5; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); } +// Overloaded 6D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, size_t dim5, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5), order_(6) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; dims_[4] = dim4; dims_[5] = dim5; - order_ = 6; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); } +// Overloaded 7D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, size_t dim5, - size_t dim6, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, 
size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6), order_(7) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; @@ -344,441 +417,163 @@ MPIArrayKokkos::MPIArrayKokkos(size_t dim0, siz dims_[4] = dim4; dims_[5] = dim5; dims_[6] = dim6; - order_ = 7; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); } -template -void MPIArrayKokkos::set_mpi_type() { - if (typeid(T).name() == typeid(bool).name()) { - mpi_datatype_ = MPI_C_BOOL; - } - else if (typeid(T).name() == typeid(int).name()) { - mpi_datatype_ = MPI_INT; - } - else if (typeid(T).name() == typeid(long int).name()) { - mpi_datatype_ = MPI_LONG; - } - else if (typeid(T).name() == typeid(long long int).name()) { - mpi_datatype_ = MPI_LONG_LONG_INT; - } - else if (typeid(T).name() == typeid(float).name()) { - mpi_datatype_ = MPI_FLOAT; - } - else if (typeid(T).name() == typeid(double).name()) { - mpi_datatype_ = MPI_DOUBLE; - } - else { - printf("Your entered MPIArrayKokkos type is not a supported type for MPI communications and is being set to int\n"); - mpi_datatype_ = MPI_INT; - } -} template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i) const { - assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 1D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 1D!"); - return this_array_.d_view(i); +T& MPICArrayKokkos::operator()(size_t i) const { + assert(order_ == 1 && "Tensor order (rank) does not match 
constructor in MPICArrayKokkos 1D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 1D!"); + return this_array_(i); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j) const { - assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 2D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 2D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 2D!"); - return this_array_.d_view(j + (i * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j) const { + assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 2D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 2D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 2D!"); + return this_array_(i, j); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k) const { - assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 3D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 3D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 3D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 3D!"); - return this_array_.d_view(k + (j * dims_[2]) - + (i * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k) const { + assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 3D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 3D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 3D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 3D!"); + return this_array_(i, j, k); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { - assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 
4D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 4D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 4D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 4D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 4D!"); - return this_array_.d_view(l + (k * dims_[3]) - + (j * dims_[3] * dims_[2]) - + (i * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { + assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 4D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 4D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 4D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 4D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 4D!"); + return this_array_(i, j, k, l); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m) const { - assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 5D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 5D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 5D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 5D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 5D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 5D!"); - return this_array_.d_view(m + (l * dims_[4]) - + (k * dims_[4] * dims_[3]) - + (j * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const { + assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 5D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 5D!"); + 
assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 5D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 5D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 5D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 5D!"); + return this_array_(i, j, k, l, m); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m, size_t n) const { - assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 6D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 6D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 6D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 6D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 6D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 6D!"); - assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 6D!"); - return this_array_.d_view(n + (m * dims_[5]) - + (l * dims_[5] * dims_[4]) - + (k * dims_[5] * dims_[4] * dims_[3]) - + (j * dims_[5] * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) const { + assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 6D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 6D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 6D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 6D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 6D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 6D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 6D!"); + return this_array_(i, j, k, l, m, n); } template KOKKOS_INLINE_FUNCTION -T& 
MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m, size_t n, size_t o) const { - assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 7D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 7D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 7D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 7D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 7D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 7D!"); - assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 7D!"); - assert(o >= 0 && o < dims_[6] && "o is out of bounds in MPIArrayKokkos 7D!"); - return this_array_.d_view(o + (n * dims_[6]) - + (m * dims_[6] * dims_[5]) - + (l * dims_[6] * dims_[5] * dims_[4]) - + (k * dims_[6] * dims_[5] * dims_[4] * dims_[3]) - + (j * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n, size_t o) const { + assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 7D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 7D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 7D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 7D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 7D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 7D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 7D!"); + assert(o < dims_[6] && "o is out of bounds in MPICArrayKokkos 7D!"); + return this_array_(i, j, k, l, m, n, o); } template KOKKOS_INLINE_FUNCTION -MPIArrayKokkos& MPIArrayKokkos::operator= (const MPIArrayKokkos& temp) { - - // Do nothing if the assignment is of the form x = x - if (this != 
&temp) { - for (int iter = 0; iter < temp.order_; iter++){ - dims_[iter] = temp.dims_[iter]; - } // end for - - order_ = temp.order_; - length_ = temp.length_; - this_array_ = temp.this_array_; - host = temp.host; - mpi_recv_rank_ = temp.mpi_recv_rank_; - mpi_tag_ = temp.mpi_tag_; - mpi_comm_ = temp.mpi_comm_; - mpi_status_ = temp.mpi_status_; - mpi_datatype_ = temp.mpi_datatype_; - mpi_request_ = temp.mpi_request_; - } - +MPICArrayKokkos& MPICArrayKokkos::operator=(const MPICArrayKokkos& temp) { + this_array_ = temp.this_array_; + host = temp.host; // Also copy the host ViewCArray + comm_plan_ = temp.comm_plan_; + send_buffer_ = temp.send_buffer_; + recv_buffer_ = temp.recv_buffer_; + stride_ = temp.stride_; return *this; } // Return size template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::size() const { - return length_; +size_t MPICArrayKokkos::size() const { + return this_array_.size(); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::extent() const { - return length_; +size_t MPICArrayKokkos::extent() const { + return this_array_.extent(); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::dims(size_t i) const { - assert(i < order_ && "MPIArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); - assert(i >= 0 && dims_[i]>0 && "Access to MPIArrayKokkos dims is out of bounds!"); - return dims_[i]; +size_t MPICArrayKokkos::dims(size_t i) const { + assert(i < order_ && "MPICArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); + assert(dims_[i]>0 && "Access to MPICArrayKokkos dims is out of bounds!"); + return this_array_.dims(i); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::order() const { - return order_; +size_t MPICArrayKokkos::order() const { + return this_array_.order(); } template KOKKOS_INLINE_FUNCTION -T* MPIArrayKokkos::device_pointer() const { - return this_array_.d_view.data(); +T* MPICArrayKokkos::device_pointer() const { + return this_array_.device_pointer(); } template 
KOKKOS_INLINE_FUNCTION -T* MPIArrayKokkos::host_pointer() const { - return this_array_.h_view.data(); +T* MPICArrayKokkos::host_pointer() const { + return this_array_.host_pointer(); } template KOKKOS_INLINE_FUNCTION -Kokkos::DualView MPIArrayKokkos::get_kokkos_dual_view() const { - return this_array_; -} - -template -void MPIArrayKokkos::update_host() { - - this_array_.template modify(); - this_array_.template sync(); +Kokkos::DualView MPICArrayKokkos::get_kokkos_dual_view() const { + return this_array_.get_kokkos_dual_view(); } template -void MPIArrayKokkos::update_device() { - - this_array_.template modify(); - this_array_.template sync(); +void MPICArrayKokkos::update_host() { + this_array_.update_host(); } -// a default setup, should not be used except for testing template -void MPIArrayKokkos::mpi_setup() { - mpi_recv_rank_ = 1; - mpi_tag_ = 99; - mpi_comm_ = MPI_COMM_WORLD; +void MPICArrayKokkos::update_device() { + this_array_.update_device(); } template -void MPIArrayKokkos::mpi_setup(int recv_rank) { - mpi_recv_rank_ = recv_rank; -} - -template -void MPIArrayKokkos::mpi_setup(int recv_rank, int tag) { - mpi_recv_rank_ = recv_rank; - mpi_tag_ = tag; -} - -template -void MPIArrayKokkos::mpi_setup(int recv_rank, int tag, MPI_Comm comm) { - mpi_recv_rank_ = recv_rank; - mpi_tag_ = tag; - mpi_comm_ = comm; -} - -template -void MPIArrayKokkos::mpi_set_rank(int recv_rank) { - mpi_recv_rank_ = recv_rank; -} - -template -void MPIArrayKokkos::mpi_set_tag(int tag) { - mpi_tag_ = tag; -} - -template -void MPIArrayKokkos::mpi_set_comm(MPI_Comm comm) { - mpi_comm_ = comm; -} - -template -int MPIArrayKokkos::get_rank() { - return mpi_recv_rank_; -} - -template -int MPIArrayKokkos::get_tag() { - return mpi_tag_; -} - -template -MPI_Comm MPIArrayKokkos::get_comm() { - return mpi_comm_; -} - -//MPI_Send wrapper -template -void MPIArrayKokkos::send(size_t count, int dest, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Send(device_pointer(), count, mpi_datatype_, 
dest, tag, comm); -#else - update_host(); - MPI_Send(host_pointer(), count, mpi_datatype_, dest, tag, comm); -#endif -} - -//MPI_Recv wrapper -template -void MPIArrayKokkos::recv(size_t count, int source, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Recv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); -#else - MPI_Recv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); - update_device(); -#endif -} - -//MPI_Send halo wrapper -template -void MPIArrayKokkos::halo_send() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Send(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); -#else - update_host(); - MPI_Send(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); -#endif -} - -//MPI_Recv halo wrapper -template -void MPIArrayKokkos::halo_recv() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Recv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); -#else - MPI_Recv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); - update_device(); -#endif -} - -//MPI_iSend halo wrapper -template -void MPIArrayKokkos::halo_isend() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Isend(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#else - update_host(); - MPI_Isend(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#endif -} - -//MPI_iRecv halo wrapper -template -void MPIArrayKokkos::halo_irecv() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Irecv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#else - MPI_Irecv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#endif -} - -//MPI_Bcast wrapper -template -void MPIArrayKokkos::broadcast(size_t count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Bcast(device_pointer(), count, mpi_datatype_, root, comm); 
-#else - update_host(); - MPI_Bcast(host_pointer(), count, mpi_datatype_, root, comm); - update_device(); -#endif -} - -//MPI_Scatter wrapper -template -void MPIArrayKokkos::scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Scatter(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); -#else - update_host(); - MPI_Scatter(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_Gather wrapper -template -void MPIArrayKokkos::gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Gather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); -#else - update_host(); - MPI_Gather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_AllGather wrapper -template -void MPIArrayKokkos::allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Allgather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, comm); -#else - update_host(); - MPI_Allgather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_Isend wrapper -template -void MPIArrayKokkos::isend(size_t count, int dest, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Isend(device_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); -#else - update_host(); - MPI_Isend(host_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); -#endif -} - -//MPI_Irecv wrapper -template -void 
MPIArrayKokkos::irecv(size_t count, int source, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Irecv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); -#else - MPI_Irecv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); -#endif -} - -//MPI_Wait wrapper for the sender -template -void MPIArrayKokkos::wait_send() { - MPI_Wait(&mpi_request_, &mpi_status_); -} +KOKKOS_INLINE_FUNCTION +MPICArrayKokkos::~MPICArrayKokkos() { -//MPI_Wait wrapper for the receiver -template -void MPIArrayKokkos::wait_recv() { - MPI_Wait(&mpi_request_, &mpi_status_); -#ifndef HAVE_GPU_AWARE_MPI - update_device(); -#endif } -//MPI_Barrier wrapper -//template -//void MPIArrayKokkos::barrier(MPI_Comm comm) { -// MPI_Barrier(comm); -//} - -template -KOKKOS_INLINE_FUNCTION -MPIArrayKokkos::~MPIArrayKokkos() {} - -//////////////////////////////////////////////////////////////////////////////// -// End of MPIArrayKokkos -//////////////////////////////////////////////////////////////////////////////// +} // end namespace mtr -} // end namespace #endif // end if have MPI - -#endif // MPI_TYPES_H - +#endif // end if MPICARRAYKOKKOS_H \ No newline at end of file diff --git a/src/include/mpi_types_old.h b/src/include/mpi_types_old.h new file mode 100644 index 00000000..b10a57fc --- /dev/null +++ b/src/include/mpi_types_old.h @@ -0,0 +1,784 @@ +#ifndef MPI_TYPES_H +#define MPI_TYPES_H +/********************************************************************************************** + © 2020. Triad National Security, LLC. All rights reserved. + This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos + National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. + Department of Energy/National Nuclear Security Administration. All rights in the program are + reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear + Security Administration. 
The Government is granted for itself and others acting on its behalf a + nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare + derivative works, distribute copies to the public, perform publicly and display publicly, and + to permit others to do so. + This program is open source under the BSD-3 License. + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior + written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ **********************************************************************************************/ + +#include "host_types.h" +#include "kokkos_types.h" +#include +#ifdef HAVE_MPI +#include + +namespace mtr +{ + +///////////////////////// +// MPIArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +///////////////////////// +template +class MPIArrayKokkos { + + // this is manage + using TArray1D = Kokkos::DualView ; + +protected: + size_t dims_[7]; + size_t length_; + size_t order_; // tensor order (rank) + int mpi_recv_rank_; + int mpi_tag_; + MPI_Comm mpi_comm_; + MPI_Status mpi_status_; + MPI_Datatype mpi_datatype_; + MPI_Request mpi_request_; + TArray1D this_array_; + + void set_mpi_type(); + +public: + // Data member to access host view + ViewCArray host; + + MPIArrayKokkos(); + + MPIArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + + // These functions can setup the data needed for halo send/receives + // Not necessary for standard MPI comms + void mpi_setup(); + + void mpi_setup(int recv_rank); + + void mpi_setup(int recv_rank, int tag); + + void mpi_setup(int recv_rank, int tag, MPI_Comm comm); + + void mpi_set_rank(int 
recv_rank); + + void mpi_set_tag(int tag); + + void mpi_set_comm(MPI_Comm comm); + + int get_rank(); + + int get_tag(); + + MPI_Comm get_comm(); + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n, size_t o) const; + + KOKKOS_INLINE_FUNCTION + MPIArrayKokkos& operator=(const MPIArrayKokkos& temp); + + // GPU Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t size() const; + + // Host Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t extent() const; + + KOKKOS_INLINE_FUNCTION + size_t dims(size_t i) const; + + KOKKOS_INLINE_FUNCTION + size_t order() const; + + // Method returns the raw device pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* device_pointer() const; + + // Method returns the raw host pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* host_pointer() const; + + // Method returns kokkos dual view + KOKKOS_INLINE_FUNCTION + TArray1D get_kokkos_dual_view() const; + + // Method that update host view + void update_host(); + + // Method that update device view + void update_device(); + + // MPI send wrapper + void send(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void recv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI broadcast wrapper + void broadcast(size_t count, int root, MPI_Comm comm); + + // MPI scatter wrapper + void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, 
int root, MPI_Comm comm); + + // MPI gather wrapper + void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + + // MPI allgather wrapper + void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); + + // MPI send wrapper + void isend(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void irecv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI wait wrapper for sender + void wait_send(); + + // MPI wait wrapper for receiver + void wait_recv(); + + // MPI barrier wrapper + //void barrier(MPI_Comm comm); + + // MPI send wrapper + void halo_send(); + + // MPI recieve wrapper + void halo_recv(); + + // MPI send wrapper + void halo_isend(); + + // MPI recieve wrapper + void halo_irecv(); + + // Deconstructor + virtual KOKKOS_INLINE_FUNCTION + ~MPIArrayKokkos (); +}; // End of MPIArrayKokkos + + +// Default constructor +template +MPIArrayKokkos::MPIArrayKokkos() { + length_ = order_ = 0; + for (int i = 0; i < 7; i++) { + dims_[i] = 0; + } +} + +// Overloaded 1D constructor +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, const std::string& tag_string) { + + dims_[0] = dim0; + order_ = 1; + length_ = dim0; + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0); + set_mpi_type(); +} + +// Overloaded 2D constructor +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + order_ = 2; + length_ = (dim0 * dim1); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + order_ = 3; + length_ = (dim0 * dim1 * dim2); + 
this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + order_ = 4; + length_ = (dim0 * dim1 * dim2 * dim3); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + order_ = 5; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, size_t dim5, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; + order_ = 6; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; + dims_[6] = dim6; + order_ = 7; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6); + this_array_ = 
TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); + set_mpi_type(); +} + +template +void MPIArrayKokkos::set_mpi_type() { + if (typeid(T).name() == typeid(bool).name()) { + mpi_datatype_ = MPI_C_BOOL; + } + else if (typeid(T).name() == typeid(int).name()) { + mpi_datatype_ = MPI_INT; + } + else if (typeid(T).name() == typeid(long int).name()) { + mpi_datatype_ = MPI_LONG; + } + else if (typeid(T).name() == typeid(long long int).name()) { + mpi_datatype_ = MPI_LONG_LONG_INT; + } + else if (typeid(T).name() == typeid(float).name()) { + mpi_datatype_ = MPI_FLOAT; + } + else if (typeid(T).name() == typeid(double).name()) { + mpi_datatype_ = MPI_DOUBLE; + } + else { + printf("Your entered MPIArrayKokkos type is not a supported type for MPI communications and is being set to int\n"); + mpi_datatype_ = MPI_INT; + } +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i) const { + assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 1D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 1D!"); + return this_array_.d_view(i); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j) const { + assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 2D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 2D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 2D!"); + return this_array_.d_view(j + (i * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k) const { + assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 3D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 3D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 3D!"); + assert(k >= 
0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 3D!"); + return this_array_.d_view(k + (j * dims_[2]) + + (i * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { + assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 4D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 4D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 4D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 4D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 4D!"); + return this_array_.d_view(l + (k * dims_[3]) + + (j * dims_[3] * dims_[2]) + + (i * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m) const { + assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 5D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 5D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 5D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 5D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 5D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 5D!"); + return this_array_.d_view(m + (l * dims_[4]) + + (k * dims_[4] * dims_[3]) + + (j * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m, size_t n) const { + assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 6D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 6D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 6D!"); + 
assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 6D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 6D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 6D!"); + assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 6D!"); + return this_array_.d_view(n + (m * dims_[5]) + + (l * dims_[5] * dims_[4]) + + (k * dims_[5] * dims_[4] * dims_[3]) + + (j * dims_[5] * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m, size_t n, size_t o) const { + assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 7D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 7D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 7D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 7D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 7D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 7D!"); + assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 7D!"); + assert(o >= 0 && o < dims_[6] && "o is out of bounds in MPIArrayKokkos 7D!"); + return this_array_.d_view(o + (n * dims_[6]) + + (m * dims_[6] * dims_[5]) + + (l * dims_[6] * dims_[5] * dims_[4]) + + (k * dims_[6] * dims_[5] * dims_[4] * dims_[3]) + + (j * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +MPIArrayKokkos& MPIArrayKokkos::operator= (const MPIArrayKokkos& temp) { + + // Do nothing if the assignment is of the form x = x + if (this != &temp) { + for (int iter = 0; iter < temp.order_; iter++){ + dims_[iter] = temp.dims_[iter]; + } // end for + + order_ = temp.order_; + length_ = 
temp.length_; + this_array_ = temp.this_array_; + host = temp.host; + mpi_recv_rank_ = temp.mpi_recv_rank_; + mpi_tag_ = temp.mpi_tag_; + mpi_comm_ = temp.mpi_comm_; + mpi_status_ = temp.mpi_status_; + mpi_datatype_ = temp.mpi_datatype_; + mpi_request_ = temp.mpi_request_; + } + + return *this; +} + +// Return size +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::size() const { + return length_; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::extent() const { + return length_; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::dims(size_t i) const { + assert(i < order_ && "MPIArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); + assert(i >= 0 && dims_[i]>0 && "Access to MPIArrayKokkos dims is out of bounds!"); + return dims_[i]; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::order() const { + return order_; +} + +template +KOKKOS_INLINE_FUNCTION +T* MPIArrayKokkos::device_pointer() const { + return this_array_.d_view.data(); +} + +template +KOKKOS_INLINE_FUNCTION +T* MPIArrayKokkos::host_pointer() const { + return this_array_.h_view.data(); +} + +template +KOKKOS_INLINE_FUNCTION +Kokkos::DualView MPIArrayKokkos::get_kokkos_dual_view() const { + return this_array_; +} + +template +void MPIArrayKokkos::update_host() { + + this_array_.template modify(); + this_array_.template sync(); +} + +template +void MPIArrayKokkos::update_device() { + + this_array_.template modify(); + this_array_.template sync(); +} + +// a default setup, should not be used except for testing +template +void MPIArrayKokkos::mpi_setup() { + mpi_recv_rank_ = 1; + mpi_tag_ = 99; + mpi_comm_ = MPI_COMM_WORLD; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank) { + mpi_recv_rank_ = recv_rank; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank, int tag) { + mpi_recv_rank_ = recv_rank; + mpi_tag_ = tag; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank, int tag, MPI_Comm comm) { + 
mpi_recv_rank_ = recv_rank; + mpi_tag_ = tag; + mpi_comm_ = comm; +} + +template +void MPIArrayKokkos::mpi_set_rank(int recv_rank) { + mpi_recv_rank_ = recv_rank; +} + +template +void MPIArrayKokkos::mpi_set_tag(int tag) { + mpi_tag_ = tag; +} + +template +void MPIArrayKokkos::mpi_set_comm(MPI_Comm comm) { + mpi_comm_ = comm; +} + +template +int MPIArrayKokkos::get_rank() { + return mpi_recv_rank_; +} + +template +int MPIArrayKokkos::get_tag() { + return mpi_tag_; +} + +template +MPI_Comm MPIArrayKokkos::get_comm() { + return mpi_comm_; +} + +//MPI_Send wrapper +template +void MPIArrayKokkos::send(size_t count, int dest, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Send(device_pointer(), count, mpi_datatype_, dest, tag, comm); +#else + update_host(); + MPI_Send(host_pointer(), count, mpi_datatype_, dest, tag, comm); +#endif +} + +//MPI_Recv wrapper +template +void MPIArrayKokkos::recv(size_t count, int source, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Recv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); +#else + MPI_Recv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); + update_device(); +#endif +} + +//MPI_Send halo wrapper +template +void MPIArrayKokkos::halo_send() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Send(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); +#else + update_host(); + MPI_Send(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); +#endif +} + +//MPI_Recv halo wrapper +template +void MPIArrayKokkos::halo_recv() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Recv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); +#else + MPI_Recv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); + update_device(); +#endif +} + +//MPI_iSend halo wrapper +template +void MPIArrayKokkos::halo_isend() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Isend(device_pointer(), size(), 
mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#else + update_host(); + MPI_Isend(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#endif +} + +//MPI_iRecv halo wrapper +template +void MPIArrayKokkos::halo_irecv() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Irecv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#else + MPI_Irecv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#endif +} + +//MPI_Bcast wrapper +template +void MPIArrayKokkos::broadcast(size_t count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Bcast(device_pointer(), count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Bcast(host_pointer(), count, mpi_datatype_, root, comm); + update_device(); +#endif +} + +//MPI_Scatter wrapper +template +void MPIArrayKokkos::scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Scatter(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Scatter(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); + recv_buffer.update_device(); +#endif +} + +//MPI_Gather wrapper +template +void MPIArrayKokkos::gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Gather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Gather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); + recv_buffer.update_device(); +#endif +} + +//MPI_AllGather wrapper +template +void MPIArrayKokkos::allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, 
MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Allgather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, comm); +#else + update_host(); + MPI_Allgather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, comm); + recv_buffer.update_device(); +#endif +} + +//MPI_Isend wrapper +template +void MPIArrayKokkos::isend(size_t count, int dest, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Isend(device_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); +#else + update_host(); + MPI_Isend(host_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); +#endif +} + +//MPI_Irecv wrapper +template +void MPIArrayKokkos::irecv(size_t count, int source, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Irecv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); +#else + MPI_Irecv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); +#endif +} + +//MPI_Wait wrapper for the sender +template +void MPIArrayKokkos::wait_send() { + MPI_Wait(&mpi_request_, &mpi_status_); +} + +//MPI_Wait wrapper for the receiver +template +void MPIArrayKokkos::wait_recv() { + MPI_Wait(&mpi_request_, &mpi_status_); +#ifndef HAVE_GPU_AWARE_MPI + update_device(); +#endif +} + +//MPI_Barrier wrapper +//template +//void MPIArrayKokkos::barrier(MPI_Comm comm) { +// MPI_Barrier(comm); +//} + +template +KOKKOS_INLINE_FUNCTION +MPIArrayKokkos::~MPIArrayKokkos() {} + +//////////////////////////////////////////////////////////////////////////////// +// End of MPIArrayKokkos +//////////////////////////////////////////////////////////////////////////////// + +} // end namespace + +#endif // end if have MPI + +#endif // MPI_TYPES_H + From 0cd362080001f2448e48f259e7451c17f739aa27 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 10 Nov 2025 16:35:13 -0600 Subject: [PATCH 30/52] ENH: Moving mpi type to base MATAR --- 
examples/mesh_decomp/mpi_type.h | 579 -------------------------------- 1 file changed, 579 deletions(-) delete mode 100644 examples/mesh_decomp/mpi_type.h diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h deleted file mode 100644 index be7984d5..00000000 --- a/examples/mesh_decomp/mpi_type.h +++ /dev/null @@ -1,579 +0,0 @@ -#ifndef MPICARRAYKOKKOS_H -#define MPICARRAYKOKKOS_H - -// #ifdef HAVE_MPI -#include -#include "matar.h" -#include "communication_plan.h" - -namespace mtr -{ - -// Type trait to map C++ types to MPI_Datatype -template -struct mpi_type_map { - static MPI_Datatype value() { - static_assert(sizeof(T) == 0, "Unsupported type for MPI communication"); - return MPI_DATATYPE_NULL; - } -}; - -// Specializations for common types -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_INT; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_LONG; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_LONG_LONG; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_UNSIGNED; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_UNSIGNED_LONG; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_FLOAT; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_DOUBLE; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_CHAR; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_UNSIGNED_CHAR; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_C_BOOL; } -}; - - -///////////////////////// -// MPICArrayKokkos: Type for managing distributed data on both CPU and GPU. 
-///////////////////////// -template -class MPICArrayKokkos { - - // Dual view for managing data on both CPU and GPU - DCArrayKokkos this_array_; - - DCArrayKokkos send_buffer_; - DCArrayKokkos recv_buffer_; - -protected: - size_t dims_[7]; - size_t length_; - size_t order_; // tensor order (rank) - - MPI_Comm mpi_comm_; - MPI_Status mpi_status_; - MPI_Datatype mpi_datatype_; - MPI_Request mpi_request_; - - - // --- Ghost Communication Support --- - CommunicationPlan* comm_plan_; // Pointer to shared communication plan - - - DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank - DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank - DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank - DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank - - size_t stride_; // [size: num_dims] Number of contiguous values per first index element - - - DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_by_rank] Indices of items to send to each rank - DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank - - - size_t num_owned_; // Number of owned items (nodes/elements) - size_t num_ghost_; // Number of ghost items (nodes/elements) - -public: - // Data member to access host view - ViewCArray host; - - - // Note, consider this for sending blocks without dealing with stride_ - // MPI_Datatype vector_type; - // MPI_Type_contiguous(stride_, mpi_type_map::value(), &vector_type); - // MPI_Type_commit(&vector_type); - - MPICArrayKokkos(); - - MPICArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& 
tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, - size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); - - - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, - size_t n) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, - size_t n, size_t o) const; - - KOKKOS_INLINE_FUNCTION - MPICArrayKokkos& operator=(const MPICArrayKokkos& temp); - - - // Method to set comm plan for halo communication - void initialize_comm_plan(CommunicationPlan& comm_plan){ - comm_plan_ = &comm_plan; - - size_t send_size = comm_plan_->total_send_count * stride_; - size_t recv_size = comm_plan_->total_recv_count * stride_; - - if (send_size > 0) { - send_buffer_ = DCArrayKokkos(send_size, "send_buffer"); - } - if (recv_size > 0) { - recv_buffer_ = DCArrayKokkos(recv_size, "recv_buffer"); - } - - if (comm_plan_->num_send_ranks > 0) { - send_counts_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_counts"); - send_displs_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_displs"); - - for(int i = 0; i < 
comm_plan_->num_send_ranks; i++){ - send_counts_.host(i) = comm_plan_->send_counts_.host(i) * stride_; - send_displs_.host(i) = comm_plan_->send_displs_.host(i) * stride_; - } - send_counts_.update_device(); - send_displs_.update_device(); - } - - if (comm_plan_->num_recv_ranks > 0) { - recv_counts_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_counts"); - recv_displs_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_displs"); - - for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ - recv_counts_.host(i) = comm_plan_->recv_counts_.host(i) * stride_; - recv_displs_.host(i) = comm_plan_->recv_displs_.host(i) * stride_; - } - recv_counts_.update_device(); - recv_displs_.update_device(); - } - }; - - - // GPU Method - // Method that returns size - KOKKOS_INLINE_FUNCTION - size_t size() const; - - // Host Method - // Method that returns size - KOKKOS_INLINE_FUNCTION - size_t extent() const; - - KOKKOS_INLINE_FUNCTION - size_t dims(size_t i) const; - - KOKKOS_INLINE_FUNCTION - size_t order() const; - - // Method returns the raw device pointer of the Kokkos DualView - KOKKOS_INLINE_FUNCTION - T* device_pointer() const; - - // Method returns the raw host pointer of the Kokkos DualView - KOKKOS_INLINE_FUNCTION - T* host_pointer() const; - - // Method returns kokkos dual view - KOKKOS_INLINE_FUNCTION - Kokkos::DualView get_kokkos_dual_view() const; - - // Method that update host view - void update_host(); - - // Method that update device view - void update_device(); - - // Method that builds the send buffer, note, this has to be ordered - // Such that all the boundary elements going to a given rank are contiguous in the send buffer. 
- void fill_send_buffer(){ - - size_t send_idx = 0; - for(int i = 0; i < comm_plan_->num_send_ranks; i++){ - for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ - size_t src_idx = comm_plan_->send_indices_.host(i, j); // index of the element to send - - // Copy all values associated with this element (handles multi-dimensional arrays) - for(size_t k = 0; k < stride_; k++){ - send_buffer_.host(send_idx + k) = this_array_.host_pointer()[src_idx * stride_ + k]; - } - send_idx += stride_; - } - } - }; - - // Method that copies the recv buffer into the this_array - void copy_recv_buffer(){ - - size_t recv_idx = 0; - for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ - for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ - size_t dest_idx = comm_plan_->recv_indices_.host(i, j); - - // Copy all values associated with this element (handles multi-dimensional arrays) - for(size_t k = 0; k < stride_; k++){ - this_array_.host_pointer()[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); - } - - recv_idx += stride_; - } - } - this_array_.update_device(); - }; - - - // Note: This "may" be needed, im not sure. Currently, it works.... - // Use nullptr for empty arrays to avoid accessing element 0 of 0-sized array (undefined behavior) - // T* send_buf_ptr = (send_buffer_.size() > 0) ? &send_buffer_.host(0) : nullptr; - // T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; - // int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; - // int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; - // int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; - // int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; - - // Method that communicates the data between the ranks - // NOTE: This is a blocking communication operation, - // if you want to use non-blocking communication, you can use the following: MPI_Ineighbor_alltoallv - void communicate(){ - - this_array_.update_host(); - - fill_send_buffer(); - - MPI_Neighbor_alltoallv( - send_buffer_.host_pointer(), - send_counts_.host_pointer(), - send_displs_.host_pointer(), - mpi_type_map::value(), // MPI_TYPE - recv_buffer_.host_pointer(), - recv_counts_.host_pointer(), - recv_displs_.host_pointer(), - mpi_type_map::value(), // MPI_TYPE - comm_plan_->mpi_comm_graph); - - copy_recv_buffer(); - - this_array_.update_device(); - }; - - void set_values(const T& value){ - this_array_.set_values(value); - }; - - // Deconstructor - virtual KOKKOS_INLINE_FUNCTION - ~MPICArrayKokkos (); -}; // End of MPIDArrayKokkos - -// Default constructor -template -MPICArrayKokkos::MPICArrayKokkos() - : this_array_(), stride_(1), length_(0), order_(0) { - for (int i = 0; i < 7; i++) { - dims_[i] = 0; - } - } - -// Overloaded 1D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) - : stride_(1), length_(dim0), order_(1) { - dims_[0] = dim0; - this_array_ = DCArrayKokkos(dim0, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0); -} - -// Overloaded 2D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) - : stride_(dim1), length_(dim0 * dim1), order_(2) { - dims_[0] = dim0; - dims_[1] = dim1; - - this_array_ = DCArrayKokkos(dim0, dim1, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1); -} - -// Overloaded 3D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) - : stride_(dim1 * dim2), length_(dim0 * dim1 * dim2), order_(3) { - dims_[0] = dim0; - dims_[1] = dim1; - dims_[2] = dim2; - 
this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2); -} - -// Overloaded 4D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3), length_(dim0 * dim1 * dim2 * dim3), order_(4) { - dims_[0] = dim0; - dims_[1] = dim1; - dims_[2] = dim2; - dims_[3] = dim3; - this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); -} - -// Overloaded 5D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4), length_(dim0 * dim1 * dim2 * dim3 * dim4), order_(5) { - dims_[0] = dim0; - dims_[1] = dim1; - dims_[2] = dim2; - dims_[3] = dim3; - dims_[4] = dim4; - this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); -} - -// Overloaded 6D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4 * dim5), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5), order_(6) { - dims_[0] = dim0; - dims_[1] = dim1; - dims_[2] = dim2; - dims_[3] = dim3; - dims_[4] = dim4; - dims_[5] = dim5; - this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); -} - -// Overloaded 7D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6), 
order_(7) { - dims_[0] = dim0; - dims_[1] = dim1; - dims_[2] = dim2; - dims_[3] = dim3; - dims_[4] = dim4; - dims_[5] = dim5; - dims_[6] = dim6; - this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); -} - - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i) const { - assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 1D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 1D!"); - return this_array_(i); -} - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i, size_t j) const { - assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 2D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 2D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 2D!"); - return this_array_(i, j); -} - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k) const { - assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 3D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 3D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 3D!"); - assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 3D!"); - return this_array_(i, j, k); -} - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { - assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 4D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 4D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 4D!"); - assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 4D!"); - assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 4D!"); - return this_array_(i, j, k, l); -} - -template -KOKKOS_INLINE_FUNCTION 
-T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const { - assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 5D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 5D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 5D!"); - assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 5D!"); - assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 5D!"); - assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 5D!"); - return this_array_(i, j, k, l, m); -} - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) const { - assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 6D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 6D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 6D!"); - assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 6D!"); - assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 6D!"); - assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 6D!"); - assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 6D!"); - return this_array_(i, j, k, l, m, n); -} - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n, size_t o) const { - assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 7D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 7D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 7D!"); - assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 7D!"); - assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 7D!"); - assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 7D!"); - assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 7D!"); - assert(o < dims_[6] && "o is out of 
bounds in MPICArrayKokkos 7D!"); - return this_array_(i, j, k, l, m, n, o); -} - -template -KOKKOS_INLINE_FUNCTION -MPICArrayKokkos& MPICArrayKokkos::operator=(const MPICArrayKokkos& temp) { - this_array_ = temp.this_array_; - host = temp.host; // Also copy the host ViewCArray - comm_plan_ = temp.comm_plan_; - send_buffer_ = temp.send_buffer_; - recv_buffer_ = temp.recv_buffer_; - stride_ = temp.stride_; - return *this; -} - -// Return size -template -KOKKOS_INLINE_FUNCTION -size_t MPICArrayKokkos::size() const { - return this_array_.size(); -} - -template -KOKKOS_INLINE_FUNCTION -size_t MPICArrayKokkos::extent() const { - return this_array_.extent(); -} - -template -KOKKOS_INLINE_FUNCTION -size_t MPICArrayKokkos::dims(size_t i) const { - assert(i < order_ && "MPICArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); - assert(dims_[i]>0 && "Access to MPICArrayKokkos dims is out of bounds!"); - return this_array_.dims(i); -} - -template -KOKKOS_INLINE_FUNCTION -size_t MPICArrayKokkos::order() const { - return this_array_.order(); -} - -template -KOKKOS_INLINE_FUNCTION -T* MPICArrayKokkos::device_pointer() const { - return this_array_.device_pointer(); -} - -template -KOKKOS_INLINE_FUNCTION -T* MPICArrayKokkos::host_pointer() const { - return this_array_.host_pointer(); -} - -template -KOKKOS_INLINE_FUNCTION -Kokkos::DualView MPICArrayKokkos::get_kokkos_dual_view() const { - return this_array_.get_kokkos_dual_view(); -} - -template -void MPICArrayKokkos::update_host() { - this_array_.update_host(); -} - -template -void MPICArrayKokkos::update_device() { - this_array_.update_device(); -} - -template -KOKKOS_INLINE_FUNCTION -MPICArrayKokkos::~MPICArrayKokkos() { - -} - -} // end namespace mtr - - -// #endif // end if have MPI -#endif // end if MPICARRAYKOKKOS_H \ No newline at end of file From 9748087165154acd01d8effffe20ff92c1283929 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 17 Nov 2025 14:00:58 -0600 Subject: [PATCH 31/52] ENH: 
Pulling out build ghost function --- examples/mesh_decomp/decomp_utils.h | 2488 +++++++++++++------------- examples/mesh_decomp/mesh_decomp.cpp | 2 +- 2 files changed, 1246 insertions(+), 1244 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index f0e7ae4d..5cdf4a6f 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -552,1480 +552,1482 @@ void naive_partition_mesh( return; } - -/** - * @brief Partitions the input mesh using PT-Scotch and constructs the final distributed mesh. - * - * This function performs parallel mesh partitioning using a two-stage approach: - * 1. A naive partition is first constructed (simple assignment of mesh elements/nodes across ranks). - * 2. PT-Scotch is then used to repartition the mesh for load balancing and improved connectivity. - * - * The partitioned mesh, nodal data, and associated connectivity/gauss point information - * are distributed among MPI ranks as a result. The procedure ensures that each rank receives - * its assigned portion of the mesh and associated data in the final (target) decomposition. - * - * @param initial_mesh[in] The input (global) mesh, present on rank 0 or all ranks at start. - * @param final_mesh[out] The mesh assigned to this rank after PT-Scotch decomposition. - * @param initial_node[in] Nodal data for the input (global) mesh; must match initial_mesh. - * @param final_node[out] Nodal data for this rank after decomposition (corresponds to final_mesh). - * @param gauss_point[out] Gauss point data structure, filled out for this rank's mesh. - * @param world_size[in] Number of MPI ranks in use (the total number of partitions). - * @param rank[in] This process's MPI rank ID. - * - * Internals: - * - The routine uses a naive_partition_mesh() helper to create an initial contiguous mesh partition. - * - It then uses PT-Scotch distributed graph routines to compute an improved partition and create the final mesh layout. 
- * - Both element-to-element and node-to-element connectivity, as well as mapping and ghosting information, - * are managed and exchanged across ranks. - * - MPI routines synchronize and exchange the relevant mesh and nodal data following the computed partition. - */ - -void partition_mesh( - Mesh_t& initial_mesh, - Mesh_t& final_mesh, - node_t& initial_node, - node_t& final_node, - GaussPoint_t& gauss_point, +void build_ghost( + Mesh_t& input_mesh, + Mesh_t& output_mesh, + node_t& input_node, + node_t& output_node, + CommunicationPlan& element_communication_plan, int world_size, - int rank){ - + int rank) +{ bool print_info = false; - bool print_vtk = false; + // ****************************************************************************************** + // Build the ghost elements and nodes + // ================================================================================================** + // + // OVERVIEW OF GHOST ELEMENT IDENTIFICATION: + // ========================================== + // In distributed memory parallel computing with MPI, each processor (rank) owns a subset of mesh + // elements. However, to perform computations that depend on element neighbors or to maintain + // consistency at domain boundaries, we need ghost elements: copies of elements from neighboring + // ranks that share nodes with our locally-owned elements. + // + // This algorithm identifies and extracts ghost element data in 5 steps: + // 1. Gather ownership information: Which rank owns which elements (via MPI_Allgatherv) + // 2. Collect local element-node connectivity for distribution + // 3. Broadcast connectivity to all ranks (via MPI_Allgatherv) + // 4. Identify which remote elements touch our local elements + // 5. 
Extract the full connectivity data for identified ghost elements + double t_ghost_start = MPI_Wtime(); + + // ======================================================================== + // STEP 1: Gather element ownership information from all ranks + // ======================================================================== + // In a distributed mesh, each rank owns a subset of elements. To identify + // ghost elements (elements from other ranks needed by this rank), we need + // to know which rank owns each element. This section uses MPI collective + // operations to gather element GID ownership information. + // + // MPI COLLECTIVE OPERATIONS EXPLAINED: + // ==================================== + // - MPI_Barrier: Synchronizes all ranks; waits until all ranks reach this point + // - MPI_Allgather: Each rank sends one item of data; each rank receives one item from each rank + // Input: Each rank provides local data + // Output: Every rank has data from every rank in order (rank 0's data, rank 1's data, ...) + // - MPI_Allgatherv: Like MPI_Allgather but for variable-sized data + // Input: Each rank provides data of potentially different sizes + // Output: Every rank has all data from all ranks, with displacement arrays specifying where each rank's data goes + // + // COMMUNICATION PATTERN VISUALIZATION: + // Rank 0: elem_count[0] ----> All ranks receive: [elem_count[0], elem_count[1], elem_count[2], ...] + // Rank 1: elem_count[1] / + // Rank 2: elem_count[2] / - // Create mesh, gauss points, and node data structures on each rank - // This is the initial partitioned mesh - Mesh_t naive_mesh; - node_t naive_node; + int nodes_per_elem = input_mesh.num_nodes_in_elem; - // Mesh partitioned by pt-scotch, not including ghost - Mesh_t intermediate_mesh; - node_t intermediate_node; + // MPI_Allgather: Each rank sends its element count, every rank receives + // the count from every other rank. Result: elem_counts[r] = number of + // elements owned by rank r. 
+ std::vector elem_counts(world_size); + MPI_Allgather(&input_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); // Synchronize all ranks before proceeding + // Compute displacements: offset into the global array for each rank's data + // Example: if elem_counts = [100, 150, 120], then + // elem_displs = [0, 100, 250] (where each rank's data starts in all_elem_gids) + std::vector elem_displs(world_size); + int total_elems = 0; + for (int r = 0; r < world_size; r++) { + elem_displs[r] = total_elems; + total_elems += elem_counts[r]; + } - // Helper arrays to hold element-element connectivity for naive partitioning that include what would be ghost, without having to build the full mesh - std::vector elems_in_elem_on_rank; - std::vector num_elems_in_elem_per_rank; + // MPI_Allgatherv: Gather variable-sized data from all ranks into one array + // Each rank contributes its local_to_global_elem_mapping, which maps + // local element indices to global element GIDs. After this call, + // all_elem_gids contains ALL element GIDs from all ranks, organized by rank. + std::vector all_elem_gids(total_elems); + MPI_Allgatherv(input_mesh.local_to_global_elem_mapping.host_pointer(), input_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, + all_elem_gids.data(), elem_counts.data(), elem_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // Build a lookup map: element GID -> owning rank + // This allows O(log n) lookups to determine which rank owns any given element. 
+ std::map elem_gid_to_rank; + for (int rank_id = 0; rank_id < world_size; rank_id++) { + for (int i = 0; i < elem_counts[rank_id]; i++) { + size_t gid = all_elem_gids[elem_displs[rank_id] + i]; + elem_gid_to_rank[gid] = rank_id; + } + } - // Perform the naive partitioning of the mesh - naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); + // ======================================================================== + // STEP 2: Build element-to-node connectivity for local elements + // ======================================================================== + // Ghost elements are elements from other ranks that share nodes with our + // locally-owned elements. To identify them, we need to exchange element-node + // connectivity information with all other ranks. + // Collect all nodes that belong to our locally-owned elements + // This set will be used later to check if a remote element is relevant + std::set local_elem_nodes; + for(int node_rid = 0; node_rid < input_mesh.num_nodes; node_rid++) { + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_rid); + local_elem_nodes.insert(node_gid); + } - /********************************************************************************** - * Build PT-Scotch distributed graph representation of the mesh for repartitioning * - ********************************************************************************** - * - * This section constructs the distributed graph (SCOTCH_Dgraph) needed by PT-Scotch - * for mesh repartitioning. In this graph, each mesh element is a vertex, and edges - * correspond to mesh-neighbor relationships (i.e., elements that share a face or are - * otherwise neighbors per your mesh definition). - * - * We use the compact CSR (Compressed Sparse Row) representation, passing only the - * essential information required by PT-Scotch. 
- * - * Variables and structures used: - * - SCOTCH_Dgraph dgraph: - * The distributed graph instance managed by PT-Scotch. Each MPI rank creates - * and fills in its portion of the global graph. - * - * - const SCOTCH_Num baseval: - * The base value for vertex and edge numbering. Set to 0 for C-style zero-based - * arrays. Always use 0 unless you are using Fortran style 1-based arrays. - * - * - const SCOTCH_Num vertlocnbr: - * The *number of local vertices* (mesh elements) defined on this MPI rank. - * In our mesh, this is mesh.num_elems. PT-Scotch expects each rank to specify - * its own local vertex count. - * - * - const SCOTCH_Num vertlocmax: - * The *maximum number of local vertices* that could be stored (capacity). We - * allocate with no unused holes, so vertlocmax = vertlocnbr. - * - * - std::vector vertloctab: - * CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i] - * gives the index in edgeloctab where the neighbor list of vertex i begins. - * PT-Scotch expects this array to be of size vertlocnbr+1, where the difference - * vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i. - * - * - std::vector edgeloctab: - * CSR array [variable size]: a flattened list of *neighboring element global IDs*, - * in no particular order. For vertex i, its neighbors are located at - * edgeloctab[vertloctab[i]...vertloctab[i+1]-1]. - * In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to - * recognize edges both within and across ranks. - * - * - std::map elem_gid_to_offset: - * Helper map: For a given element global ID, gives the starting offset in - * the flattened neighbor array (elems_in_elem_on_rank) where this element's - * list of neighbor GIDs begins. This allows efficient neighbor list lookup. - * - * - (other arrays used, from mesh setup and communication phase) - * - elements_on_rank: vector of global element IDs owned by this rank. - * - num_elements_on_rank: number of owned elements. 
- * - num_elems_in_elem_per_rank: array, for each owned element, how many - * neighbors it has. - * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. - * - **********************************************************************************/ + // ======================================================================== + // STEP 3: Exchange element-to-node connectivity via MPI_Allgatherv + // ======================================================================== + // Build a flattened connectivity array: pairs of (elem_gid, node_gid) + // Example for 2 elements with 8 nodes each: + // elem_node_conn = [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] + // + // This format is chosen because it's easy to serialize and deserialize over MPI, + // and allows us to reconstruct the full element-node relationships. + std::vector elem_node_conn; + int local_conn_size = 0; - // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- - SCOTCH_Dgraph dgraph; - if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; - MPI_Abort(MPI_COMM_WORLD, 1); + // For each locally-owned element, record its GID and all its node GIDs + for (int lid = 0; lid < input_mesh.num_elems; lid++) { + size_t elem_gid = input_mesh.local_to_global_elem_mapping.host(lid); + + // Access nodes_in_elem[lid][*] to get all nodes in this element + for (int j = 0; j < input_mesh.num_nodes_in_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(lid, j); // Local index + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); // Global index + + elem_node_conn.push_back(elem_gid); + elem_node_conn.push_back(node_gid); + } + local_conn_size += nodes_per_elem * 2; // Each element contributes (num_nodes_in_elem * 2) size_ts } - // Set base value for numbering (0 for C-style arrays) - const SCOTCH_Num baseval = 0; - - // vertlocnbr: Number of 
elements (vertices) that are local to this MPI rank - const SCOTCH_Num vertlocnbr = static_cast(naive_mesh.num_elems); - // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) - const SCOTCH_Num vertlocmax = vertlocnbr; - // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- - // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins - std::vector vertloctab(vertlocnbr + 1); + // ======================================================================== + // Perform MPI communication to gather connectivity from all ranks + // ======================================================================== + // Similar to Step 1, we use MPI_Allgatherv to collect all element-node + // connectivity pairs. This is a two-stage process: + // 1) Gather the size of each rank's connectivity data + // 2) Gather the actual connectivity data with proper offsets - // edgeloctab: flat array of neighbor global IDs for all local elements, built in order - std::vector edgeloctab; - edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance + // Stage 1: Gather connectivity sizes from each rank + // conn_sizes[r] = number of size_t values that rank r will send + std::vector conn_sizes(world_size); + MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); - // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) - // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. 
- std::map elem_gid_to_offset;
-    size_t current_offset = 0;
-    for (size_t k = 0; k < naive_mesh.num_elems; k++) {
-        int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k);
-        elem_gid_to_offset[elem_gid_on_rank] = current_offset;
-        current_offset += num_elems_in_elem_per_rank[k]; // WARNING< THIS MUST INCLUDE GHOST< WHICH DONT EXISTS ON THE NAIVE MESH
+    // Compute displacements for the second MPI_Allgatherv call
+    // Displacements tell each rank where its data should be placed in the global array
+    std::vector conn_displs(world_size);
+    int total_conn = 0;
+    for (int r = 0; r < world_size; r++) {
+        conn_displs[r] = total_conn;
+        total_conn += conn_sizes[r];
     }
 
-    // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element ---
-    SCOTCH_Num offset = 0; // running count of edges encountered
-    
-    for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) {
+    // Stage 2: Gather all element-node connectivity data
+    // After this call, all_conn contains the flattened connectivity from every rank,
+    // organized by rank. Access data from rank r using indices [conn_displs[r], conn_displs[r] + conn_sizes[r])
+    std::vector all_conn(total_conn);
+    MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG,
+                   all_conn.data(), conn_sizes.data(), conn_displs.data(),
+                   MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 
-        // Record current edge offset for vertex lid in vertloctab
-        vertloctab[lid] = offset;
+    // ========================================================================
+    // STEP 4: Identify ghost elements
+    // ========================================================================
+    // A ghost element is an element owned by another rank that shares at least
+    // one node with our locally-owned elements. This step identifies all such elements.
- // Obtain this local element's global ID (from mapping) - int elem_gid = naive_mesh.local_to_global_elem_mapping.host(lid); + // Build a set of locally-owned element GIDs for quick lookup + std::set local_elem_gids; + for (int i = 0; i < input_mesh.num_elems; i++) { + local_elem_gids.insert(input_mesh.local_to_global_elem_mapping.host(i)); + } - // Find offset in the flattened neighbor array for this element's neighbor list - size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; + // Build a temporary map: node GID -> set of element GIDs (from other ranks) that contain it + // This helps us identify which remote elements are adjacent to our local elements + std::map> node_to_ext_elem; - // For this element, find the count of its neighbors - // This requires finding its index in the elements_on_rank array - size_t idx = 0; - for (size_t k = 0; k < naive_mesh.num_elems; k++) { - int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); - if (elem_gid_on_rank == elem_gid) { - idx = k; - break; + // Iterate through connectivity data from each rank (except ourselves) + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this node belongs to one of our locally-owned elements + if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { + // Check if this element is NOT owned by us (i.e., it's from another rank) + if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { + // This is a ghost element for us + node_to_ext_elem[node_gid].insert(elem_gid); + } } } - size_t num_nbrs = num_elems_in_elem_per_rank[idx]; + } - // Append each neighbor (by its GLOBAL elem GID) to edgeloctab - for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! - edgeloctab.push_back(static_cast(neighbor_gid)); - ++offset; // Increment running edge count + // Extract all unique ghost element GIDs + // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) + std::set ghost_elem_gids; + for (const auto& pair : node_to_ext_elem) { + for (size_t elem_gid : pair.second) { + ghost_elem_gids.insert(elem_gid); } } - // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure - vertloctab[vertlocnbr] = offset; + // Additional check: elements that are neighbors of our locally-owned elements + // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) - // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally - // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) - const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) - const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size 
matches number of endpoints + // for (int lid = 0; lid < num_new_elems; lid++) { + // size_t num_neighbors = input_mesh.num_elems_in_elem(lid); + + // for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { + // size_t neighbor_lid = input_mesh.elems_in_elem(lid, nbr_idx); + + // if (neighbor_lid < static_cast(num_new_elems)) { + // size_t neighbor_gid = input_mesh.local_to_global_elem_mapping(neighbor_lid); + + // // Check if neighbor is owned by this rank + // auto it = elem_gid_to_rank.find(neighbor_gid); + // if (it != elem_gid_to_rank.end() && it->second != rank) { + // // Neighbor is owned by another rank - it's a ghost for us + // std::cout << "[rank " << rank << "] found ghost element " << neighbor_gid << std::endl; + // ghost_elem_gids.insert(neighbor_gid); + // } + // } + // } + // } + + // Store the count of ghost elements for later use + input_mesh.num_ghost_elems = ghost_elem_gids.size(); - // Optionally print graph structure for debugging/validation - if (print_info) { - std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr - << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; - std::cout << "vertloctab (CSR row offsets): "; - for (size_t i = 0; i <= vertlocnbr; i++) { - std::cout << vertloctab[i] << " "; - } - std::cout << std::endl; - std::cout << "edgeloctab (first 20 neighbor GIDs): "; - for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { - std::cout << edgeloctab[i] << " "; - } - std::cout << std::endl; - } MPI_Barrier(MPI_COMM_WORLD); - /************************************************************************** - * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild - * - * - PT-Scotch will use our CSR arrays. Since we use compact representation, - * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") - * can be passed as nullptr. - * - edgeloctab contains *GLOBAL element GIDs* of neighbors. 
PT-Scotch uses this - * to discover connections across processor boundaries, so you do not have to - * encode ownership or partition information yourself. - **************************************************************************/ - int rc = SCOTCH_dgraphBuild( - &dgraph, - baseval, // start index (0) - vertlocnbr, // local vertex count (local elements) - vertlocmax, // local vertex max (no holes) - vertloctab.data(), // row offsets in edgeloctab - /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) - /*veloloctab*/ nullptr, // vertex weights, not used - /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) - edgelocnbr, // local edge endpoints count - edgelocsiz, // size of edge array - edgeloctab.data(), // global neighbor IDs for each local node - /*edgegsttab*/ nullptr, // ghost edge array, not used - /*edloloctab*/ nullptr // edge weights, not used - ); - if (rc != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; - SCOTCH_dgraphFree(&dgraph); - MPI_Abort(MPI_COMM_WORLD, rc); + + // ======================================================================== + // STEP 5: Extract ghost element connectivity + // ======================================================================== + // Now that we know which elements are ghosts, we need to extract their + // full node connectivity from all_conn. This allows us to properly construct + // the extended mesh with ghost elements included. 
+ + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; + + // Build a map: ghost_elem_gid -> vector of node_gids + // We pre-allocate the vector size to avoid repeated reallocations + std::map> ghost_elem_to_nodes; + for (const size_t& ghost_gid : ghost_elem_gids) { + ghost_elem_to_nodes[ghost_gid].reserve(input_mesh.num_nodes_in_elem); } - // Optionally, print rank summary after graph build for further validation - if (print_info) { - SCOTCH_Num vertlocnbr_out; - SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); - std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; + // ======================================================================== + // Extract nodes for each ghost element from the globally-collected all_conn + // ======================================================================== + // The all_conn array was populated by MPI_Allgatherv and contains connectivity + // pairs (elem_gid, node_gid) for all elements from all ranks. We now parse + // this data to extract the nodes for each ghost element. 
+ for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already have owned element connectivity + + // Parse connectivity data for rank r + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Calculate offset for this pair: displacement + (pair_index * 2) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this element is one of our identified ghost elements, record its node + auto it = ghost_elem_to_nodes.find(elem_gid); + if (it != ghost_elem_to_nodes.end()) { + it->second.push_back(node_gid); + } + } } - MPI_Barrier(MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished building the distributed graph using PT-Scotch"<(input_mesh.num_nodes_in_elem)) { + std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first + << " has " << pair.second.size() << " nodes, expected " << input_mesh.num_nodes_in_elem << std::endl; + } + } - /******************************************************** - * Step 5: Validate the graph using SCOTCH_dgraphCheck - ********************************************************/ - rc = SCOTCH_dgraphCheck(&dgraph); - if (rc != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphCheck failed rc=" << rc << "\n"; - SCOTCH_dgraphFree(&dgraph); - MPI_Abort(MPI_COMM_WORLD, rc); + // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) + // Start with owned nodes + std::map node_gid_to_extended_lid; + int extended_node_lid = 0; + + // Add all owned nodes + for (int i = 0; i < input_mesh.num_nodes; i++) { + size_t node_gid = input_mesh.local_to_global_node_mapping.host(i); + node_gid_to_extended_lid[node_gid] = extended_node_lid++; } - /************************************************************** - * Step 6: Partition (repartition) the mesh using PT-Scotch - * - Each vertex (mesh element) will be assigned a part (mesh chunk). 
- * - Arch is initialized for a complete graph of world_size parts (one per rank). - **************************************************************/ - // SCOTCH_Arch controls the "architecture" for partitioning: the topology - // (number and connectivity of parts) to which the graph will be mapped. - // The archdat variable encodes this. Below are common options: - // - // - SCOTCH_archCmplt(&archdat, nbparts) - // * Creates a "complete graph" architecture with nbparts nodes (fully connected). - // Every part is equally distant from every other part. - // This is typically used when minimizing only *balance* and *edge cut*, - // not considering any underlying machine topology. - // - // - SCOTCH_archHcub(&archdat, dimension) - // * Hypercube architecture (rare in modern use). - // Sets up a hypercube of given dimension. - // - // - SCOTCH_archTleaf / SCOTCH_archTleafX - // * Tree architectures, for hierarchically structured architectures. - // - // - SCOTCH_archMesh2 / SCOTCH_archMesh3 - // * 2D or 3D mesh topology architectures (useful for grid/matrix machines). - // - // - SCOTCH_archBuild - // * General: builds any architecture from a descriptor string. - // - // For distributed mesh partitioning to MPI ranks (where all ranks are equal), - // the most common and appropriate is "complete graph" (Cmplt): each part (rank) - // is equally reachable from any other (no communication topology bias). - SCOTCH_Arch archdat; // PT-Scotch architecture structure: describes desired partition topology - SCOTCH_archInit(&archdat); - // Partition into 'world_size' equally connected parts (each MPI rank is a "node") - // Other topology options could be substituted above according to your needs (see docs). 
- SCOTCH_archCmplt(&archdat, static_cast(world_size)); + // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) + std::set ghost_only_nodes; + for (const auto& pair : ghost_elem_to_nodes) { + for (size_t node_gid : pair.second) { + // Check if we already have this node + if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { + ghost_only_nodes.insert(node_gid); + } + } + } + // Assign extended local IDs to ghost-only nodes + for (size_t node_gid : ghost_only_nodes) { + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + int total_extended_nodes = extended_node_lid; - - // ===================== PT-Scotch Strategy Selection and Documentation ====================== - // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. - // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. - // - // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): - // - // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. - // Useful for quick, generic partitions where quality is not critical. - // - // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). - // For large runs or test runs where speed is more important than minimizing edgecut. - // - // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). - // Slower than the default. Use when high-quality partitioning is desired. - // - // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. - // Use if load balance is more critical than cut size. - // - // Additional Options: - // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). - // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). 
- // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. - // - // Example usage: - // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); - // ^ quality-focused, nparts=number of parts/ranks - // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); - // ^ speed-focused, allow 5% imbalance - // - // Reference: - // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf - // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. - // - // --------------- Set up the desired partitioning strategy here: --------------- - SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings - SCOTCH_stratInit(&stratdat); + // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) + // Build request list: for each ghost node, find an owning rank via any ghost element that contains it + std::map> rank_to_ghost_node_requests; + for (size_t node_gid : ghost_only_nodes) { + // Find which rank owns an element containing this node + // Look through ghost elements + for (const auto& pair : ghost_elem_to_nodes) { + size_t ghost_elem_gid = pair.first; + const std::vector& nodes = pair.second; + bool found = false; + for (size_t ngid : nodes) { + if (ngid == node_gid) { + found = true; + break; + } + } + if (found) { + auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); + if (owner_it != elem_gid_to_rank.end()) { + rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); + break; + } + } + } + } - // Select partitioning strategy for this run: - // Use SCOTCH_STRATQUALITY for best cut quality. - // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. 
- // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) - SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.001); + // Step 4: Build extended element list and node connectivity + // Owned elements: 0 to num_new_elems-1 (already have these) + // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 - // partloctab: output array mapping each local element (vertex) to a *target partition number* - // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. - std::vector partloctab(vertlocnbr); - rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); - if (rc != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; - SCOTCH_stratExit(&stratdat); - SCOTCH_archExit(&archdat); - SCOTCH_dgraphFree(&dgraph); - MPI_Abort(MPI_COMM_WORLD, rc); + // Create extended element-node connectivity array + int total_extended_elems = input_mesh.num_elems + input_mesh.num_ghost_elems; + std::vector> extended_nodes_in_elem(total_extended_elems); + + // Copy owned element connectivity (convert to extended node LIDs) + for (int lid = 0; lid < input_mesh.num_elems; lid++) { + extended_nodes_in_elem[lid].reserve(nodes_per_elem); + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[lid].push_back(ext_lid); + } } - // Clean up PT-Scotch strategy and architecture objects - SCOTCH_stratExit(&stratdat); - SCOTCH_archExit(&archdat); - - // Free the graph now that we have the partition assignments - SCOTCH_dgraphFree(&dgraph); + // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) + int ghost_elem_ext_lid = input_mesh.num_elems; + std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), 
ghost_elem_gids.end()); + std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); - /*************************************************************************** - * Step 7 (Optional): Print out the partitioning assignment per element - * - Each local element's local index lid and global ID (gid) are listed with the - * part to which PT-Scotch has assigned them. - ***************************************************************************/ - print_info = false; - for(int rank_id = 0; rank_id < world_size; rank_id++) { - if(rank_id == rank && print_info) { - for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { - size_t gid = naive_mesh.local_to_global_elem_mapping.host(lid); - std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid - << " -> part=" << partloctab[lid] << "\n"; + for (size_t ghost_gid : ghost_elem_gids_ordered) { + auto it = ghost_elem_to_nodes.find(ghost_gid); + if (it == ghost_elem_to_nodes.end()) continue; + + extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); + for (size_t node_gid : it->second) { + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); + } + ghost_elem_ext_lid++; + } + + MPI_Barrier(MPI_COMM_WORLD); + // Sequential rank-wise printing of extended mesh structure info + if(print_info) { + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << input_mesh.num_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << input_mesh.num_nodes << std::endl; + std::cout << "[rank " << rank << "] - Ghost-only nodes: " 
<< ghost_only_nodes.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; + std::cout << std::flush; } MPI_Barrier(MPI_COMM_WORLD); } - MPI_Barrier(MPI_COMM_WORLD); } - print_info = false; + // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements + // Each element's nodes are stored using extended local node IDs (0-based, contiguous) -// ****************************************************************************************** -// Build the final mesh from the repartition -// ****************************************************************************************** + // Build reverse maps: extended_lid -> gid for nodes and elements + std::vector extended_lid_to_node_gid(total_extended_nodes); + for (const auto& pair : node_gid_to_extended_lid) { + extended_lid_to_node_gid[pair.second] = pair.first; + } + // Build extended element GID list: owned first, then ghost + std::vector extended_lid_to_elem_gid(total_extended_elems); + // Owned elements + for (int i = 0; i < input_mesh.num_elems; i++) { + extended_lid_to_elem_gid[i] = input_mesh.local_to_global_elem_mapping.host(i); + } - MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) std::cout << "\n=== Starting Mesh Redistribution Phase ===\n"; - MPI_Barrier(MPI_COMM_WORLD); + // Ghost elements (in sorted order) + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + extended_lid_to_elem_gid[input_mesh.num_elems + i] = ghost_elem_gids_ordered[i]; + } - // -------------- Phase 1: Determine elements to send to each rank -------------- - std::vector> elems_to_send(world_size); - for (int lid = 0; lid < naive_mesh.num_elems; lid++) { - int dest = static_cast(partloctab[lid]); - int elem_gid = static_cast(naive_mesh.local_to_global_elem_mapping.host(lid)); - elems_to_send[dest].push_back(elem_gid); + // Build array: for each ghost element, store which rank owns it (where to receive data from) + std::vector 
ghost_elem_owner_ranks(ghost_elem_gids_ordered.size()); + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + size_t ghost_gid = ghost_elem_gids_ordered[i]; + auto it = elem_gid_to_rank.find(ghost_gid); + if (it != elem_gid_to_rank.end()) { + ghost_elem_owner_ranks[i] = it->second; + } else { + std::cerr << "[rank " << rank << "] ERROR: Ghost element GID " << ghost_gid + << " not found in elem_gid_to_rank map!" << std::endl; + ghost_elem_owner_ranks[i] = -1; // Invalid rank as error indicator + } } - // -------------- Phase 2: Exchange element GIDs -------------- - std::vector sendcounts(world_size), recvcounts(world_size); - for (int r = 0; r < world_size; r++) - sendcounts[r] = static_cast(elems_to_send[r].size()); + // Create a std::set of all the ranks this rank will receive data from + std::set ghost_elem_receive_ranks; + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); + } - MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + // ****************************************************************************************** + // Build the final partitioned mesh + // ****************************************************************************************** - MPI_Barrier(MPI_COMM_WORLD); - // Compute displacements - std::vector sdispls(world_size), rdispls(world_size); - int send_total = 0, recv_total = 0; - for (int r = 0; r < world_size; r++) { - sdispls[r] = send_total; - rdispls[r] = recv_total; - send_total += sendcounts[r]; - recv_total += recvcounts[r]; + output_mesh.initialize_nodes(total_extended_nodes); + output_mesh.initialize_elems(total_extended_elems, 3); + output_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); + output_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); + for (int i = 0; i < total_extended_nodes; i++) { + output_mesh.local_to_global_node_mapping.host(i) = 
extended_lid_to_node_gid[i]; + } + for (int i = 0; i < total_extended_elems; i++) { + output_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; } + output_mesh.local_to_global_node_mapping.update_device(); + output_mesh.local_to_global_elem_mapping.update_device(); + output_mesh.num_ghost_elems = ghost_elem_gids.size(); + output_mesh.num_ghost_nodes = ghost_only_nodes.size(); - // Flatten send buffer - // send_elems: flattened list of element global IDs (GIDs) that this rank is sending to all other ranks. - // For each rank r, elems_to_send[r] contains the element GIDs that should be owned by rank r after repartitioning. - std::vector send_elems; - send_elems.reserve(send_total); - for (int r = 0; r < world_size; r++) - send_elems.insert(send_elems.end(), elems_to_send[r].begin(), elems_to_send[r].end()); - // new_elem_gids: receives the list of new element global IDs this rank will own after the exchange. - // It is filled after MPI_Alltoallv completes, and contains the GIDs for the elements new to (or remained on) this rank. 
- std::vector new_elem_gids(recv_total); - MPI_Alltoallv(send_elems.data(), sendcounts.data(), sdispls.data(), MPI_INT, - new_elem_gids.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); - + output_mesh.num_owned_elems = input_mesh.num_elems; + output_mesh.num_owned_nodes = input_mesh.num_nodes; + MPI_Barrier(MPI_COMM_WORLD); - - // New elements owned by this rank - int num_new_elems = static_cast(new_elem_gids.size()); - - if (print_info) { - std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; + // rebuild the local element-node connectivity using the local node ids + // extended_nodes_in_elem already contains extended local node IDs, so we can use them directly + for(int i = 0; i < total_extended_elems; i++) { + for(int j = 0; j < nodes_per_elem; j++) { + output_mesh.nodes_in_elem.host(i, j) = extended_nodes_in_elem[i][j]; + } } - // -------------- Phase 3: Send element–node connectivity -------------- - int nodes_per_elem = naive_mesh.num_nodes_in_elem; + MPI_Barrier(MPI_COMM_WORLD); - // Flatten element-node connectivity by global node IDs - std::vector conn_sendbuf; - for (int r = 0; r < world_size; r++) { - for (int elem_gid : elems_to_send[r]) { - // find local element lid from elem_gid - int lid = -1; - for (int i = 0; i < naive_mesh.num_elems; i++) - if (naive_mesh.local_to_global_elem_mapping.host(i) == elem_gid) { lid = i; break; } + double t_ghost_end = MPI_Wtime(); - for (int j = 0; j < nodes_per_elem; j++) { - int node_lid = naive_mesh.nodes_in_elem.host(lid, j); - int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); - conn_sendbuf.push_back(node_gid); - } - } + if (rank == 0) { + std::cout << " Finished calculating ghost elements" << std::endl; + std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." 
<< std::endl; } - // element-node connectivity counts (ints per dest rank) - std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); - for (int r = 0; r < world_size; r++) - conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; - - MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - + output_mesh.nodes_in_elem.update_device(); + output_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< conn_sdispls(world_size), conn_rdispls(world_size); - int conn_send_total = 0, conn_recv_total = 0; - for (int r = 0; r < world_size; r++) { - conn_sdispls[r] = conn_send_total; - conn_rdispls[r] = conn_recv_total; - conn_send_total += conn_sendcounts[r]; - conn_recv_total += conn_recvcounts[r]; - } + if(rank == 0) std::cout << " Finished building final mesh structure with ghost nodes and elements" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); - std::vector conn_recvbuf(conn_recv_total); - MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, - conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); + // ****************************************************************************************** + // Build the final nodes that include ghost + // ****************************************************************************************** - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); - std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); - int num_new_nodes = static_cast(new_node_gids.size()); + output_node.initialize(total_extended_nodes, 3, {node_state::coords}); - // Build map gid→lid - std::unordered_map node_gid_to_lid; - for (int i = 0; i < num_new_nodes; i++) - node_gid_to_lid[new_node_gids[i]] = i; + // The goal here is to 
populate output_node.coords using globally gathered ghost node coordinates, + // since input_node does not contain ghost node coordinates. + // + // Each rank will: + // 1. Gather coordinates of its owned nodes (from input_node). + // 2. Use MPI to gather all coordinates for all required (owned + ghost) global node IDs + // into a structure mapping global ID -> coordinate. + // 3. Use this map to fill output_node.coords. - if (print_info) - std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; + // 1. Build list of all global node IDs needed on this rank (owned + ghosts) + std::vector all_needed_node_gids(total_extended_nodes); + for (int i = 0; i < total_extended_nodes; i++) { + all_needed_node_gids[i] = output_mesh.local_to_global_node_mapping.host(i); + } + // 2. Build owned node GIDs and their coordinates + std::vector owned_gids(output_mesh.num_owned_nodes); + for (int i = 0; i < output_mesh.num_owned_nodes; i++) + owned_gids[i] = output_mesh.local_to_global_node_mapping.host(i); - // -------------- Phase 5: Request node coordinates -------------- - std::vector node_coords_sendbuf; - for (int r = 0; r < world_size; r++) { - for (int gid : elems_to_send[r]) { - int lid = -1; - for (int i = 0; i < naive_mesh.num_elems; i++) - if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) + // so we can distribute the needed coordinate data. 
+ // The easiest is to Allgather everyone's "owned_gids" and coords - for (int j = 0; j < nodes_per_elem; j++) { - int node_lid = naive_mesh.nodes_in_elem.host(lid, j); - int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + int local_owned_count = static_cast(owned_gids.size()); + std::vector owned_counts(world_size, 0); + if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 0)); - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 1)); - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 2)); - } - } + // a) Gather counts + owned_counts.resize(world_size, 0); + MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // b) Displacements and total + std::vector owned_displs(world_size,0); + int total_owned = 0; + for (int r = 0; r < world_size; r++) { + owned_displs[r] = total_owned; + total_owned += owned_counts[r]; } - // Each node is 3 doubles; same sendcounts scaling applies - std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); - for (int r = 0; r < world_size; r++) - coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; + // c) Global GIDs (size: total_owned) + std::vector all_owned_gids(total_owned); + MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, + all_owned_gids.data(), owned_counts.data(), owned_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging node coordinates counts"< coord_sdispls(world_size), coord_rdispls(world_size); - int coord_send_total = 0, coord_recv_total = 0; + // d) Global coords (size: total_owned x 3) + std::vector owned_coords_send(3*local_owned_count, 0.0); + for (int i = 0; i < local_owned_count; i++) { + 
owned_coords_send[3*i+0] = input_node.coords.host(i,0); + owned_coords_send[3*i+1] = input_node.coords.host(i,1); + owned_coords_send[3*i+2] = input_node.coords.host(i,2); + } + std::vector all_owned_coords(3 * total_owned, 0.0); + + // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + std::vector coord_counts(world_size); + std::vector coord_displs(world_size); for (int r = 0; r < world_size; r++) { - coord_sdispls[r] = coord_send_total; - coord_rdispls[r] = coord_recv_total; - coord_send_total += coord_sendcounts[r]; - coord_recv_total += coord_recvcounts[r]; + coord_counts[r] = 3 * owned_counts[r]; // Each node has 3 doubles + coord_displs[r] = 3 * owned_displs[r]; // Displacement in doubles } - std::vector coord_recvbuf(coord_recv_total); - MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, - coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); + MPI_Allgatherv(owned_coords_send.data(), 3*local_owned_count, MPI_DOUBLE, + all_owned_coords.data(), coord_counts.data(), coord_displs.data(), + MPI_DOUBLE, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging node coordinates"< coord[3] + std::unordered_map> gid_to_coord; + for (int i = 0; i < total_owned; i++) { + std::array xyz = { + all_owned_coords[3*i+0], + all_owned_coords[3*i+1], + all_owned_coords[3*i+2] + }; + gid_to_coord[all_owned_gids[i]] = xyz; + } - // -------------- Phase 6: Build the intermediate_mesh -------------- - intermediate_mesh.initialize_nodes(num_new_nodes); - intermediate_mesh.initialize_elems(num_new_elems, naive_mesh.num_dims); - intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); - intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + // 4. Finally, fill output_node.coords with correct coordinates. 
+ for (int i = 0; i < total_extended_nodes; i++) { + size_t gid = output_mesh.local_to_global_node_mapping.host(i); + auto it = gid_to_coord.find(gid); + if (it != gid_to_coord.end()) { + output_node.coords.host(i,0) = it->second[0]; + output_node.coords.host(i,1) = it->second[1]; + output_node.coords.host(i,2) = it->second[2]; + } else { + // Could happen if there's a bug: fill with zeros for safety + output_node.coords.host(i,0) = 0.0; + output_node.coords.host(i,1) = 0.0; + output_node.coords.host(i,2) = 0.0; + } + } + output_node.coords.update_device(); - // Fill global mappings - for (int i = 0; i < num_new_nodes; i++) - intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; - for (int i = 0; i < num_new_elems; i++) - intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; - intermediate_mesh.local_to_global_node_mapping.update_device(); - intermediate_mesh.local_to_global_elem_mapping.update_device(); + // -------------------------------------------------------------------------------------- + // Build the send patterns for elements + // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost element GIDs. + // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + std::vector>> boundary_elem_targets(output_mesh.num_owned_elems); + // Prepare local ghost list as vector + std::vector ghost_gids_vec; + ghost_gids_vec.reserve(output_mesh.num_ghost_elems); + for (int i = 0; i < output_mesh.num_ghost_elems; i++) { + ghost_gids_vec.push_back(output_mesh.local_to_global_elem_mapping.host(output_mesh.num_owned_elems + i)); // Ghost elements are after the owned elements in the global element mapping + } - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< ghost_counts(world_size, 0); + int local_ghost_count = output_mesh.num_ghost_elems; + MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - int node_gid = conn_recvbuf[i * intermediate_mesh.num_nodes_in_elem + j]; + // Displacements and recv buffer + std::vector ghost_displs(world_size, 0); + int total_ghosts = 0; + for (int r = 0; r < world_size; r++) { + ghost_displs[r] = total_ghosts; + total_ghosts += ghost_counts[r]; + } + std::vector all_ghost_gids(total_ghosts); - int node_lid = -1; + // Gather ghost gids + MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, + all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - // Binary search through local_to_global_node_mapping to find the equivalent local index - int left = 0, right = num_new_nodes - 1; - while (left <= right) { - int mid = left + (right - left) / 2; - size_t mid_gid = intermediate_mesh.local_to_global_node_mapping.host(mid); - if (node_gid == mid_gid) { - node_lid = mid; - break; - } else if (node_gid < mid_gid) { - right = mid - 1; - } else { - left = mid + 1; - } - } - intermediate_mesh.nodes_in_elem.host(i, j) = node_lid; + + // Build map gid -> ranks that ghost it + std::unordered_map> 
gid_to_ghosting_ranks; + gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); + for (int r = 0; r < world_size; r++) { + int cnt = ghost_counts[r]; + int off = ghost_displs[r]; + for (int i = 0; i < cnt; i++) { + size_t g = all_ghost_gids[off + i]; + gid_to_ghosting_ranks[g].push_back(r); + } + } + + // For each local element, list destinations: ranks that ghost our gid + for (int elem_lid = 0; elem_lid < output_mesh.num_owned_elems; elem_lid++) { + size_t local_elem_gid = output_mesh.local_to_global_elem_mapping.host(elem_lid); + auto it = gid_to_ghosting_ranks.find(local_elem_gid); + if (it == gid_to_ghosting_ranks.end()) continue; + const std::vector &dest_ranks = it->second; + for (int rr : dest_ranks) { + if (rr == rank) continue; + boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); } } MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished reverse mapping of the element-node connectivity from the global node ids to the local node ids"< boundary_elem_local_ids; + std::vector> boundary_to_ghost_ranks; // ragged array dimensions (num_boundary_elems, num_ghost_ranks) - // Fill node coordinates - // coord_recvbuf contains coords in element-node order, but we need them in node order - // Build a map from node GID to coordinates - std::map> node_gid_to_coords; - int coord_idx = 0; - for (int e = 0; e < intermediate_mesh.num_elems; ++e) { - for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { - int node_gid = conn_recvbuf[e * intermediate_mesh.num_nodes_in_elem + j]; - if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { - node_gid_to_coords[node_gid] = { - coord_recvbuf[coord_idx*3 + 0], - coord_recvbuf[coord_idx*3 + 1], - coord_recvbuf[coord_idx*3 + 2] - }; + std::set ghost_comm_ranks; // set of ranks that this rank communicates with + + + for (int elem_lid = 0; elem_lid < output_mesh.num_owned_elems; elem_lid++) { + + int local_elem_gid = 
output_mesh.local_to_global_elem_mapping.host(elem_lid); + if (boundary_elem_targets[elem_lid].empty()) + { + continue; + } + else + { + // Fill in vector of boundary local_ids + boundary_elem_local_ids.push_back(elem_lid); + std::vector ghost_ranks_for_this_boundary_elem; + for (const auto &pr : boundary_elem_targets[elem_lid]) { + ghost_ranks_for_this_boundary_elem.push_back(pr.first); + ghost_comm_ranks.insert(pr.first); } - coord_idx++; + boundary_to_ghost_ranks.push_back(ghost_ranks_for_this_boundary_elem); } } - - // Now fill coordinates in node order - intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); - for (int i = 0; i < num_new_nodes; i++) { - int node_gid = new_node_gids[i]; - auto it = node_gid_to_coords.find(node_gid); - if (it != node_gid_to_coords.end()) { - intermediate_node.coords.host(i, 0) = it->second[0]; - intermediate_node.coords.host(i, 1) = it->second[1]; - intermediate_node.coords.host(i, 2) = it->second[2]; - } + + int num_ghost_comm_ranks = ghost_comm_ranks.size(); + std::vector ghost_comm_ranks_vec(num_ghost_comm_ranks); + int i = 0; + for (const auto &r : ghost_comm_ranks) { + ghost_comm_ranks_vec[i] = r; + i++; } - intermediate_node.coords.update_device(); - // Connectivity rebuild - intermediate_mesh.build_connectivity(); + MPI_Barrier(MPI_COMM_WORLD); + output_mesh.num_boundary_elems = boundary_elem_local_ids.size(); + output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems); + for (int i = 0; i < output_mesh.num_boundary_elems; i++) { + output_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; + } + output_mesh.boundary_elem_local_ids.update_device(); + print_info = false; + + + MPI_Barrier(MPI_COMM_WORLD); -// ****************************************************************************************** -// Build the ghost elements and nodes -// ================================================================================================** -// -// OVERVIEW OF GHOST 
ELEMENT IDENTIFICATION: -// ========================================== -// In distributed memory parallel computing with MPI, each processor (rank) owns a subset of mesh -// elements. However, to perform computations that depend on element neighbors or to maintain -// consistency at domain boundaries, we need ghost elements: copies of elements from neighboring -// ranks that share nodes with our locally-owned elements. -// -// This algorithm identifies and extracts ghost element data in 5 steps: -// 1. Gather ownership information: Which rank owns which elements (via MPI_Allgatherv) -// 2. Collect local element-node connectivity for distribution -// 3. Broadcast connectivity to all ranks (via MPI_Allgatherv) -// 4. Identify which remote elements touch our local elements -// 5. Extract the full connectivity data for identified ghost elements -// -// KEY DATA STRUCTURES: -// - elem_gid_to_rank: Map from element global ID to owning rank -// - all_elem_gids: Every element GID from every rank (on every rank) -// - all_conn: Flattened (elem_gid, node_gid) pairs from every rank (on every rank) -// - ghost_elem_gids: Set of remote element GIDs that are ghosts for this rank -// - ghost_elem_to_nodes: Map from ghost element GID to its node GIDs -// -// WHY THIS APPROACH? -// - MPI_Allgatherv is efficient for gathering all data to all ranks -// - Connectivity pairs allow flexible reconstruction of element-node relationships -// - Using sets and maps for efficient lookups (O(log n) instead of O(n)) -// - Distributed computation avoids a single bottleneck rank -// - double t_ghost_start = MPI_Wtime(); - // ======================================================================== - // STEP 1: Gather element ownership information from all ranks - // ======================================================================== - // In a distributed mesh, each rank owns a subset of elements. 
To identify - // ghost elements (elements from other ranks needed by this rank), we need - // to know which rank owns each element. This section uses MPI collective - // operations to gather element GID ownership information. - // - // MPI COLLECTIVE OPERATIONS EXPLAINED: - // ==================================== - // - MPI_Barrier: Synchronizes all ranks; waits until all ranks reach this point - // - MPI_Allgather: Each rank sends one item of data; each rank receives one item from each rank - // Input: Each rank provides local data - // Output: Every rank has data from every rank in order (rank 0's data, rank 1's data, ...) - // - MPI_Allgatherv: Like MPI_Allgather but for variable-sized data - // Input: Each rank provides data of potentially different sizes - // Output: Every rank has all data from all ranks, with displacement arrays specifying where each rank's data goes - // - // COMMUNICATION PATTERN VISUALIZATION: - // Rank 0: elem_count[0] ----> All ranks receive: [elem_count[0], elem_count[1], elem_count[2], ...] - // Rank 1: elem_count[1] / - // Rank 2: elem_count[2] / + // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator + // that efficiently represents the communication pattern between ranks. + // This allows MPI to optimize communication based on the actual connectivity pattern. - // MPI_Allgather: Each rank sends its element count, every rank receives - // the count from every other rank. Result: elem_counts[r] = number of - // elements owned by rank r. 
- std::vector elem_counts(world_size); - MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); // Synchronize all ranks before proceeding - - // Compute displacements: offset into the global array for each rank's data - // Example: if elem_counts = [100, 150, 120], then - // elem_displs = [0, 100, 250] (where each rank's data starts in all_elem_gids) - std::vector elem_displs(world_size); - int total_elems = 0; - for (int r = 0; r < world_size; r++) { - elem_displs[r] = total_elems; - total_elems += elem_counts[r]; - } - - // MPI_Allgatherv: Gather variable-sized data from all ranks into one array - // Each rank contributes its local_to_global_elem_mapping, which maps - // local element indices to global element GIDs. After this call, - // all_elem_gids contains ALL element GIDs from all ranks, organized by rank. - std::vector all_elem_gids(total_elems); - MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, - all_elem_gids.data(), elem_counts.data(), elem_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - - // Build a lookup map: element GID -> owning rank - // This allows O(log n) lookups to determine which rank owns any given element. - std::map elem_gid_to_rank; - for (int r = 0; r < world_size; r++) { - for (int i = 0; i < elem_counts[r]; i++) { - size_t gid = all_elem_gids[elem_displs[r] + i]; - elem_gid_to_rank[gid] = r; - } - } - - // ======================================================================== - // STEP 2: Build element-to-node connectivity for local elements - // ======================================================================== - // Ghost elements are elements from other ranks that share nodes with our - // locally-owned elements. To identify them, we need to exchange element-node - // connectivity information with all other ranks. 
- - // Collect all nodes that belong to our locally-owned elements - // This set will be used later to check if a remote element is relevant - std::set local_elem_nodes; - for(int node_rid = 0; node_rid < intermediate_mesh.num_nodes; node_rid++) { - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_rid); - local_elem_nodes.insert(node_gid); - } - // ======================================================================== - // STEP 3: Exchange element-to-node connectivity via MPI_Allgatherv - // ======================================================================== - // Build a flattened connectivity array: pairs of (elem_gid, node_gid) - // Example for 2 elements with 8 nodes each: - // elem_node_conn = [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] - // - // This format is chosen because it's easy to serialize and deserialize over MPI, - // and allows us to reconstruct the full element-node relationships. - std::vector elem_node_conn; - int local_conn_size = 0; + // ---------- Prepare INCOMING edges (sources) ---------- + // indegree: Number of ranks from which this rank will RECEIVE data + // These are the ranks that own elements which are ghosted on this rank + std::vector ghost_elem_receive_ranks_vec(ghost_elem_receive_ranks.begin(), + ghost_elem_receive_ranks.end()); + // The number of ranks from which this rank will receive data (incoming neighbors) + int indegree = static_cast(ghost_elem_receive_ranks_vec.size()); - // For each locally-owned element, record its GID and all its node GIDs - for (int lid = 0; lid < intermediate_mesh.num_elems; lid++) { - size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); - - // Access nodes_in_elem[lid][*] to get all nodes in this element - for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); // Local index - size_t node_gid = 
intermediate_mesh.local_to_global_node_mapping.host(node_lid); // Global index - - elem_node_conn.push_back(elem_gid); - elem_node_conn.push_back(node_gid); - } - local_conn_size += nodes_per_elem * 2; // Each element contributes (num_nodes_in_elem * 2) size_ts - } + // sources: Array of source rank IDs (ranks we receive from) + // Each element corresponds to a rank that owns elements we ghost + int* sources = (indegree > 0) ? ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; - - - // ======================================================================== - // Perform MPI communication to gather connectivity from all ranks - // ======================================================================== - // Similar to Step 1, we use MPI_Allgatherv to collect all element-node - // connectivity pairs. This is a two-stage process: - // 1) Gather the size of each rank's connectivity data - // 2) Gather the actual connectivity data with proper offsets - // Stage 1: Gather connectivity sizes from each rank - // conn_sizes[r] = number of size_t values that rank r will send - std::vector conn_sizes(world_size); - MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* sourceweights = MPI_UNWEIGHTED; - // Compute displacements for the second MPI_Allgatherv call - // Displcements tell each rank where its data should be placed in the global array - std::vector conn_displs(world_size); - int total_conn = 0; - for (int r = 0; r < world_size; r++) { - conn_displs[r] = total_conn; - total_conn += conn_sizes[r]; - } + // ---------- Prepare OUTGOING edges (destinations) ---------- + // outdegree: Number of ranks to which this rank will SEND data + // These are the ranks that ghost elements owned by this rank + int outdegree = num_ghost_comm_ranks; 
- // Stage 2: Gather all element-node connectivity data - // After this call, all_conn contains the flattened connectivity from every rank, - // organized by rank. Access data from rank r using indices [conn_displs[r], conn_displs[r] + conn_sizes[r]) - std::vector all_conn(total_conn); - MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, - all_conn.data(), conn_sizes.data(), conn_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + // destinations: Array of destination rank IDs (ranks we send to) + // Each element corresponds to a rank that ghosts our owned elements + int* destinations = (outdegree > 0) ? ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; + + // Initialize the graph communicator for element communication + element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); MPI_Barrier(MPI_COMM_WORLD); - // ======================================================================== - // STEP 4: Identify ghost elements - // ======================================================================== - // A ghost element is an element owned by another rank that shares at least - // one node with our locally-owned elements. This step identifies all such elements. 
+ // Optional: Verify the graph communicator was created successfully + // if(print_info) element_communication_plan.verify_graph_communicator(); + + // ****************************************************************************************** +// Build send counts and displacements for element communication +// ****************************************************************************************** + + // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== + // For MPI_Neighbor_alltoallv with graph communicator: + // - elem_sendcounts[i] = number of elements to send to i-th outgoing neighbor (destinations_out[i]) + // - elem_sdispls[i] = starting position in send buffer for i-th outgoing neighbor - // Build a set of locally-owned element GIDs for quick lookup - std::set local_elem_gids; - for (int i = 0; i < intermediate_mesh.num_elems; i++) { - local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); - } + // std::vector elem_sendcounts(element_communication_plan.num_send_ranks, 0); + // std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); - // Build a temporary map: node GID -> set of element GIDs (from other ranks) that contain it - // This helps us identify which remote elements are adjacent to our local elements - std::map> node_to_ext_elem; + // Count how many boundary elements go to each destination rank + // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element + std::map> elems_to_send_by_rank; // rank -> list of boundary element local IDs - // Iterate through connectivity data from each rank (except ourselves) - for (int r = 0; r < world_size; r++) { - if (r == rank) continue; // Skip our own data - we already know our elements - - // Parse the connectivity data for rank r - // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
- // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 - int num_pairs = conn_sizes[r] / 2; - - for (int i = 0; i < num_pairs; i++) { - // Offset into all_conn for this pair (elem_gid, node_gid) - int offset = conn_displs[r] + i * 2; - size_t elem_gid = all_conn[offset]; - size_t node_gid = all_conn[offset + 1]; - - // Check if this node belongs to one of our locally-owned elements - if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { - // Check if this element is NOT owned by us (i.e., it's from another rank) - if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { - // This is a ghost element for us - node_to_ext_elem[node_gid].insert(elem_gid); - } + for (int elem_lid = 0; elem_lid < input_mesh.num_elems; elem_lid++) { + if (!boundary_elem_targets[elem_lid].empty()) { + for (const auto &pr : boundary_elem_targets[elem_lid]) { + int dest_rank = pr.first; + elems_to_send_by_rank[dest_rank].push_back(elem_lid); } } } - - // Extract all unique ghost element GIDs - // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) - std::set ghost_elem_gids; - for (const auto& pair : node_to_ext_elem) { - for (size_t elem_gid : pair.second) { - ghost_elem_gids.insert(elem_gid); + + // Serialize into a DRaggedRightArrayKokkos + CArrayKokkos strides_array(element_communication_plan.num_send_ranks); + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + strides_array(i) = elems_to_send_by_rank[dest_rank].size(); + } + DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); + + // Fill in the data + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + for (int j = 0; j < elems_to_send_by_rank[dest_rank].size(); j++) { + elems_to_send_by_rank_rr.host(i, j) = elems_to_send_by_rank[dest_rank][j]; } } + 
elems_to_send_by_rank_rr.update_device(); + - // Additional check: elements that are neighbors of our locally-owned elements - // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) - - // for (int lid = 0; lid < num_new_elems; lid++) { - // size_t num_neighbors = intermediate_mesh.num_elems_in_elem(lid); - - // for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { - // size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); - - // if (neighbor_lid < static_cast(num_new_elems)) { - // size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); - - // // Check if neighbor is owned by this rank - // auto it = elem_gid_to_rank.find(neighbor_gid); - // if (it != elem_gid_to_rank.end() && it->second != rank) { - // // Neighbor is owned by another rank - it's a ghost for us - // std::cout << "[rank " << rank << "] found ghost element " << neighbor_gid << std::endl; - // ghost_elem_gids.insert(neighbor_gid); - // } - // } - // } - // } - - // Store the count of ghost elements for later use - intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); - - MPI_Barrier(MPI_COMM_WORLD); - - - // ======================================================================== - // STEP 5: Extract ghost element connectivity - // ======================================================================== - // Now that we know which elements are ghosts, we need to extract their - // full node connectivity from all_conn. This allows us to properly construct - // the extended mesh with ghost elements included. 
- - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; + // Count how many ghost elements come from each source rank + // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element + std::map> elems_to_recv_by_rank; // rank -> list of ghost element indices - // Build a map: ghost_elem_gid -> vector of node_gids - // We pre-allocate the vector size to avoid repeated reallocations - std::map> ghost_elem_to_nodes; - for (const size_t& ghost_gid : ghost_elem_gids) { - ghost_elem_to_nodes[ghost_gid].reserve(intermediate_mesh.num_nodes_in_elem); + for (size_t i = 0; i < ghost_elem_owner_ranks.size(); i++) { + int source_rank = ghost_elem_owner_ranks[i]; + int ghost_elem_local_id = output_mesh.num_owned_elems + i; + elems_to_recv_by_rank[source_rank].push_back(ghost_elem_local_id); } - - // ======================================================================== - // Extract nodes for each ghost element from the globally-collected all_conn - // ======================================================================== - // The all_conn array was populated by MPI_Allgatherv and contains connectivity - // pairs (elem_gid, node_gid) for all elements from all ranks. We now parse - // this data to extract the nodes for each ghost element. 
- for (int r = 0; r < world_size; r++) { - if (r == rank) continue; // Skip our own data - we already have owned element connectivity - - // Parse connectivity data for rank r - int num_pairs = conn_sizes[r] / 2; - - for (int i = 0; i < num_pairs; i++) { - // Calculate offset for this pair: displacement + (pair_index * 2) - int offset = conn_displs[r] + i * 2; - size_t elem_gid = all_conn[offset]; - size_t node_gid = all_conn[offset + 1]; - - // If this element is one of our identified ghost elements, record its node - auto it = ghost_elem_to_nodes.find(elem_gid); - if (it != ghost_elem_to_nodes.end()) { - it->second.push_back(node_gid); - } - } + + // ========== Serialize into a DRaggedRightArrayKokkos ========== + CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); + elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); + } - - // ======================================================================== - // Validation: Verify each ghost element has the correct number of nodes - // ======================================================================== - // This catch detects issues in the MPI communication or parsing logic - for (auto& pair : ghost_elem_to_nodes) { - if (pair.second.size() != static_cast(intermediate_mesh.num_nodes_in_elem)) { - std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first - << " has " << pair.second.size() << " nodes, expected " << intermediate_mesh.num_nodes_in_elem << std::endl; + DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); + // Fill in the data + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); + for (int j = 0; j < elems_to_recv_by_rank[source_rank].size(); j++) { + 
elems_to_recv_by_rank_rr.host(i, j) = elems_to_recv_by_rank[source_rank][j]; } } - - // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) - // Start with owned nodes - std::map node_gid_to_extended_lid; - int extended_node_lid = 0; - - // Add all owned nodes - for (int i = 0; i < intermediate_mesh.num_nodes; i++) { - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(i); - node_gid_to_extended_lid[node_gid] = extended_node_lid++; - } - - // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) - std::set ghost_only_nodes; - for (const auto& pair : ghost_elem_to_nodes) { - for (size_t node_gid : pair.second) { - // Check if we already have this node - if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { - ghost_only_nodes.insert(node_gid); - } - } + elems_to_recv_by_rank_rr.update_device(); + element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); + + MPI_Barrier(MPI_COMM_WORLD); + + +} + + +/** + * @brief Partitions the input mesh using PT-Scotch and constructs the final distributed mesh. + * + * This function performs parallel mesh partitioning using a two-stage approach: + * 1. A naive partition is first constructed (simple assignment of mesh elements/nodes across ranks). + * 2. PT-Scotch is then used to repartition the mesh for load balancing and improved connectivity. + * + * The partitioned mesh, nodal data, and associated connectivity/gauss point information + * are distributed among MPI ranks as a result. The procedure ensures that each rank receives + * its assigned portion of the mesh and associated data in the final (target) decomposition. + * + * @param initial_mesh[in] The input (global) mesh, present on rank 0 or all ranks at start. + * @param final_mesh[out] The mesh assigned to this rank after PT-Scotch decomposition. + * @param initial_node[in] Nodal data for the input (global) mesh; must match initial_mesh. 
+ * @param final_node[out] Nodal data for this rank after decomposition (corresponds to final_mesh). + * @param gauss_point[out] Gauss point data structure, filled out for this rank's mesh. + * @param world_size[in] Number of MPI ranks in use (the total number of partitions). + * @param rank[in] This process's MPI rank ID. + * + * Internals: + * - The routine uses a naive_partition_mesh() helper to create an initial contiguous mesh partition. + * - It then uses PT-Scotch distributed graph routines to compute an improved partition and create the final mesh layout. + * - Both element-to-element and node-to-element connectivity, as well as mapping and ghosting information, + * are managed and exchanged across ranks. + * - MPI routines synchronize and exchange the relevant mesh and nodal data following the computed partition. + */ + +void partition_mesh( + Mesh_t& initial_mesh, + Mesh_t& final_mesh, + node_t& initial_node, + node_t& final_node, + GaussPoint_t& gauss_point, + int world_size, + int rank){ + + bool print_info = false; + bool print_vtk = false; + + // Create mesh, gauss points, and node data structures on each rank + // This is the initial partitioned mesh + Mesh_t naive_mesh; + node_t naive_node; + + // Mesh partitioned by pt-scotch, not including ghost + Mesh_t intermediate_mesh; + node_t intermediate_node; + + + // Helper arrays to hold element-element connectivity for naive partitioning that include what would be ghost, without having to build the full mesh + std::vector elems_in_elem_on_rank; + std::vector num_elems_in_elem_per_rank; + + + // Perform the naive partitioning of the mesh + naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); + + + /********************************************************************************** + * Build PT-Scotch distributed graph representation of the mesh for repartitioning * + 
********************************************************************************** + * + * This section constructs the distributed graph (SCOTCH_Dgraph) needed by PT-Scotch + * for mesh repartitioning. In this graph, each mesh element is a vertex, and edges + * correspond to mesh-neighbor relationships (i.e., elements that share a face or are + * otherwise neighbors per your mesh definition). + * + * We use the compact CSR (Compressed Sparse Row) representation, passing only the + * essential information required by PT-Scotch. + * + * Variables and structures used: + * - SCOTCH_Dgraph dgraph: + * The distributed graph instance managed by PT-Scotch. Each MPI rank creates + * and fills in its portion of the global graph. + * + * - const SCOTCH_Num baseval: + * The base value for vertex and edge numbering. Set to 0 for C-style zero-based + * arrays. Always use 0 unless you are using Fortran style 1-based arrays. + * + * - const SCOTCH_Num vertlocnbr: + * The *number of local vertices* (mesh elements) defined on this MPI rank. + * In our mesh, this is mesh.num_elems. PT-Scotch expects each rank to specify + * its own local vertex count. + * + * - const SCOTCH_Num vertlocmax: + * The *maximum number of local vertices* that could be stored (capacity). We + * allocate with no unused holes, so vertlocmax = vertlocnbr. + * + * - std::vector vertloctab: + * CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i] + * gives the index in edgeloctab where the neighbor list of vertex i begins. + * PT-Scotch expects this array to be of size vertlocnbr+1, where the difference + * vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i. + * + * - std::vector edgeloctab: + * CSR array [variable size]: a flattened list of *neighboring element global IDs*, + * in no particular order. For vertex i, its neighbors are located at + * edgeloctab[vertloctab[i]...vertloctab[i+1]-1]. 
+ * In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to + * recognize edges both within and across ranks. + * + * - std::map elem_gid_to_offset: + * Helper map: For a given element global ID, gives the starting offset in + * the flattened neighbor array (elems_in_elem_on_rank) where this element's + * list of neighbor GIDs begins. This allows efficient neighbor list lookup. + * + * - (other arrays used, from mesh setup and communication phase) + * - elements_on_rank: vector of global element IDs owned by this rank. + * - num_elements_on_rank: number of owned elements. + * - num_elems_in_elem_per_rank: array, for each owned element, how many + * neighbors it has. + * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. + * + **********************************************************************************/ + + // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- + SCOTCH_Dgraph dgraph; + if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; + MPI_Abort(MPI_COMM_WORLD, 1); } - - // Assign extended local IDs to ghost-only nodes - for (size_t node_gid : ghost_only_nodes) { - node_gid_to_extended_lid[node_gid] = extended_node_lid++; + + // Set base value for numbering (0 for C-style arrays) + const SCOTCH_Num baseval = 0; + + // vertlocnbr: Number of elements (vertices) that are local to this MPI rank + const SCOTCH_Num vertlocnbr = static_cast(naive_mesh.num_elems); + + // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) + const SCOTCH_Num vertlocmax = vertlocnbr; + + // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- + // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins + std::vector vertloctab(vertlocnbr + 1); + + // edgeloctab: flat array of neighbor global IDs for all local elements, built 
in order + std::vector edgeloctab; + edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance + + // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) + // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. + std::map elem_gid_to_offset; + size_t current_offset = 0; + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + elem_gid_to_offset[elem_gid_on_rank] = current_offset; + current_offset += num_elems_in_elem_per_rank[k]; // WARNING< THIS MUST INCLUDE GHOST< WHICH DONT EXISTS ON THE NAIVE MESH } - - int total_extended_nodes = extended_node_lid; - - // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) - // Build request list: for each ghost node, find an owning rank via any ghost element that contains it - std::map> rank_to_ghost_node_requests; - for (size_t node_gid : ghost_only_nodes) { - // Find which rank owns an element containing this node - // Look through ghost elements - for (const auto& pair : ghost_elem_to_nodes) { - size_t ghost_elem_gid = pair.first; - const std::vector& nodes = pair.second; - bool found = false; - for (size_t ngid : nodes) { - if (ngid == node_gid) { - found = true; - break; - } - } - if (found) { - auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); - if (owner_it != elem_gid_to_rank.end()) { - rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); - break; - } + + // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- + SCOTCH_Num offset = 0; // running count of edges encountered + + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { + + // Record current edge offset for vertex lid in vertloctab + vertloctab[lid] = offset; + + // Obtain this local element's global ID (from mapping) + int elem_gid = 
naive_mesh.local_to_global_elem_mapping.host(lid); + + // Find offset in the flattened neighbor array for this element's neighbor list + size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; + + // For this element, find the count of its neighbors + // This requires finding its index in the elements_on_rank array + size_t idx = 0; + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + if (elem_gid_on_rank == elem_gid) { + idx = k; + break; } } - } - - // Step 4: Build extended element list and node connectivity - // Owned elements: 0 to num_new_elems-1 (already have these) - // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 - - // Create extended element-node connectivity array - int total_extended_elems = intermediate_mesh.num_elems + intermediate_mesh.num_ghost_elems; - std::vector> extended_nodes_in_elem(total_extended_elems); - - // Copy owned element connectivity (convert to extended node LIDs) - for (int lid = 0; lid < intermediate_mesh.num_elems; lid++) { - extended_nodes_in_elem[lid].reserve(nodes_per_elem); - for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - int ext_lid = node_gid_to_extended_lid[node_gid]; - extended_nodes_in_elem[lid].push_back(ext_lid); + size_t num_nbrs = num_elems_in_elem_per_rank[idx]; + + // Append each neighbor (by its GLOBAL elem GID) to edgeloctab + for (size_t j = 0; j < num_nbrs; j++) { + size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! 
+ edgeloctab.push_back(static_cast(neighbor_gid)); + ++offset; // Increment running edge count } } - - // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) - int ghost_elem_ext_lid = intermediate_mesh.num_elems; - std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); - std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); - - for (size_t ghost_gid : ghost_elem_gids_ordered) { - auto it = ghost_elem_to_nodes.find(ghost_gid); - if (it == ghost_elem_to_nodes.end()) continue; - - extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); - for (size_t node_gid : it->second) { - int ext_lid = node_gid_to_extended_lid[node_gid]; - extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); + + // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure + vertloctab[vertlocnbr] = offset; + + // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally + // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) + const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) + const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size matches number of endpoints + + // Optionally print graph structure for debugging/validation + if (print_info) { + std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr + << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; + std::cout << "vertloctab (CSR row offsets): "; + for (size_t i = 0; i <= vertlocnbr; i++) { + std::cout << vertloctab[i] << " "; } - ghost_elem_ext_lid++; - } - - MPI_Barrier(MPI_COMM_WORLD); - // Sequential rank-wise printing of extended mesh structure info - if(print_info) { - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] Finished building extended mesh structure" << 
std::endl; - std::cout << "[rank " << rank << "] - Owned elements: " << intermediate_mesh.num_elems << std::endl; - std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; - std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; - std::cout << "[rank " << rank << "] - Owned nodes: " << intermediate_mesh.num_nodes << std::endl; - std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; - std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; - std::cout << std::flush; - } - MPI_Barrier(MPI_COMM_WORLD); + std::cout << std::endl; + std::cout << "edgeloctab (first 20 neighbor GIDs): "; + for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { + std::cout << edgeloctab[i] << " "; } + std::cout << std::endl; } - // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements - // Each element's nodes are stored using extended local node IDs (0-based, contiguous) - - // Build reverse maps: extended_lid -> gid for nodes and elements - std::vector extended_lid_to_node_gid(total_extended_nodes); - for (const auto& pair : node_gid_to_extended_lid) { - extended_lid_to_node_gid[pair.second] = pair.first; - } - - // Build extended element GID list: owned first, then ghost - std::vector extended_lid_to_elem_gid(total_extended_elems); - // Owned elements - for (int i = 0; i < intermediate_mesh.num_elems; i++) { - extended_lid_to_elem_gid[i] = intermediate_mesh.local_to_global_elem_mapping.host(i); + MPI_Barrier(MPI_COMM_WORLD); + + /************************************************************************** + * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild + * + * - PT-Scotch will use our CSR arrays. 
Since we use compact representation, + * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") + * can be passed as nullptr. + * - edgeloctab contains *GLOBAL element GIDs* of neighbors. PT-Scotch uses this + * to discover connections across processor boundaries, so you do not have to + * encode ownership or partition information yourself. + **************************************************************************/ + int rc = SCOTCH_dgraphBuild( + &dgraph, + baseval, // start index (0) + vertlocnbr, // local vertex count (local elements) + vertlocmax, // local vertex max (no holes) + vertloctab.data(), // row offsets in edgeloctab + /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) + /*veloloctab*/ nullptr, // vertex weights, not used + /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) + edgelocnbr, // local edge endpoints count + edgelocsiz, // size of edge array + edgeloctab.data(), // global neighbor IDs for each local node + /*edgegsttab*/ nullptr, // ghost edge array, not used + /*edloloctab*/ nullptr // edge weights, not used + ); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); } - // Ghost elements (in sorted order) - for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { - extended_lid_to_elem_gid[intermediate_mesh.num_elems + i] = ghost_elem_gids_ordered[i]; + + // Optionally, print rank summary after graph build for further validation + if (print_info) { + SCOTCH_Num vertlocnbr_out; + SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); + std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; } + MPI_Barrier(MPI_COMM_WORLD); - // Build array: for each ghost element, store which rank owns it (where to receive data from) - std::vector ghost_elem_owner_ranks(ghost_elem_gids_ordered.size()); - for 
(size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { - size_t ghost_gid = ghost_elem_gids_ordered[i]; - auto it = elem_gid_to_rank.find(ghost_gid); - if (it != elem_gid_to_rank.end()) { - ghost_elem_owner_ranks[i] = it->second; - } else { - std::cerr << "[rank " << rank << "] ERROR: Ghost element GID " << ghost_gid - << " not found in elem_gid_to_rank map!" << std::endl; - ghost_elem_owner_ranks[i] = -1; // Invalid rank as error indicator - } + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished building the distributed graph using PT-Scotch"< ghost_elem_receive_ranks; - for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { - ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); - } + /************************************************************** + * Step 6: Partition (repartition) the mesh using PT-Scotch + * - Each vertex (mesh element) will be assigned a part (mesh chunk). + * - Arch is initialized for a complete graph of world_size parts (one per rank). + **************************************************************/ + // SCOTCH_Arch controls the "architecture" for partitioning: the topology + // (number and connectivity of parts) to which the graph will be mapped. + // The archdat variable encodes this. Below are common options: + // + // - SCOTCH_archCmplt(&archdat, nbparts) + // * Creates a "complete graph" architecture with nbparts nodes (fully connected). + // Every part is equally distant from every other part. + // This is typically used when minimizing only *balance* and *edge cut*, + // not considering any underlying machine topology. + // + // - SCOTCH_archHcub(&archdat, dimension) + // * Hypercube architecture (rare in modern use). + // Sets up a hypercube of given dimension. + // + // - SCOTCH_archTleaf / SCOTCH_archTleafX + // * Tree architectures, for hierarchically structured architectures. 
+ // + // - SCOTCH_archMesh2 / SCOTCH_archMesh3 + // * 2D or 3D mesh topology architectures (useful for grid/matrix machines). + // + // - SCOTCH_archBuild + // * General: builds any architecture from a descriptor string. + // + // For distributed mesh partitioning to MPI ranks (where all ranks are equal), + // the most common and appropriate is "complete graph" (Cmplt): each part (rank) + // is equally reachable from any other (no communication topology bias). + SCOTCH_Arch archdat; // PT-Scotch architecture structure: describes desired partition topology + SCOTCH_archInit(&archdat); + // Partition into 'world_size' equally connected parts (each MPI rank is a "node") + // Other topology options could be substituted above according to your needs (see docs). + SCOTCH_archCmplt(&archdat, static_cast(world_size)); + -// ****************************************************************************************** -// Build the final partitioned mesh -// ****************************************************************************************** + + // ===================== PT-Scotch Strategy Selection and Documentation ====================== + // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. + // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. + // + // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): + // + // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. + // Useful for quick, generic partitions where quality is not critical. + // + // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). + // For large runs or test runs where speed is more important than minimizing edgecut. + // + // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). + // Slower than the default. 
Use when high-quality partitioning is desired. + // + // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. + // Use if load balance is more critical than cut size. + // + // Additional Options: + // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). + // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). + // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. + // + // Example usage: + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); + // ^ quality-focused, nparts=number of parts/ranks + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); + // ^ speed-focused, allow 5% imbalance + // + // Reference: + // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf + // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. + // + // --------------- Set up the desired partitioning strategy here: --------------- + SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings + SCOTCH_stratInit(&stratdat); + + // Select partitioning strategy for this run: + // Use SCOTCH_STRATQUALITY for best cut quality. + // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. 
+ // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) + SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.001); - final_mesh.initialize_nodes(total_extended_nodes); - final_mesh.initialize_elems(total_extended_elems, 3); - final_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); - final_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); - for (int i = 0; i < total_extended_nodes; i++) { - final_mesh.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; - } - for (int i = 0; i < total_extended_elems; i++) { - final_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; + // partloctab: output array mapping each local element (vertex) to a *target partition number* + // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. + std::vector partloctab(vertlocnbr); + rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); } - final_mesh.local_to_global_node_mapping.update_device(); - final_mesh.local_to_global_elem_mapping.update_device(); - final_mesh.num_ghost_elems = ghost_elem_gids.size(); - final_mesh.num_ghost_nodes = ghost_only_nodes.size(); + // Clean up PT-Scotch strategy and architecture objects + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + // Free the graph now that we have the partition assignments + SCOTCH_dgraphFree(&dgraph); - final_mesh.num_owned_elems = intermediate_mesh.num_elems; - final_mesh.num_owned_nodes = intermediate_mesh.num_nodes; - - MPI_Barrier(MPI_COMM_WORLD); - // rebuild the local element-node connectivity using the local node ids - // extended_nodes_in_elem already contains extended local node IDs, so 
we can use them directly - for(int i = 0; i < total_extended_elems; i++) { - for(int j = 0; j < nodes_per_elem; j++) { - final_mesh.nodes_in_elem.host(i, j) = extended_nodes_in_elem[i][j]; + /*************************************************************************** + * Step 7 (Optional): Print out the partitioning assignment per element + * - Each local element's local index lid and global ID (gid) are listed with the + * part to which PT-Scotch has assigned them. + ***************************************************************************/ + print_info = false; + for(int rank_id = 0; rank_id < world_size; rank_id++) { + if(rank_id == rank && print_info) { + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { + size_t gid = naive_mesh.local_to_global_elem_mapping.host(lid); + std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid + << " -> part=" << partloctab[lid] << "\n"; + } + MPI_Barrier(MPI_COMM_WORLD); } + MPI_Barrier(MPI_COMM_WORLD); } + print_info = false; - MPI_Barrier(MPI_COMM_WORLD); +// ****************************************************************************************** +// Build the intermediate mesh (without ghost nodes and elements) from the repartition +// ****************************************************************************************** - double t_ghost_end = MPI_Wtime(); - - if (rank == 0) { - std::cout << " Finished calculating ghost elements" << std::endl; - std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." 
<< std::endl; - } - final_mesh.nodes_in_elem.update_device(); - final_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); - - if(rank == 0) std::cout << " Finished building final mesh structure with ghost nodes and elements" << std::endl; + if (rank == 0) std::cout << "\n=== Starting Mesh Redistribution Phase ===\n"; MPI_Barrier(MPI_COMM_WORLD); -// ****************************************************************************************** -// Build the final nodes that include ghost -// ****************************************************************************************** - - - final_node.initialize(total_extended_nodes, 3, {node_state::coords}); - - // The goal here is to populate final_node.coords using globally gathered ghost node coordinates, - // since intermediate_node does not contain ghost node coordinates. - // - // Each rank will: - // 1. Gather coordinates of its owned nodes (from intermediate_node). - // 2. Use MPI to gather all coordinates for all required (owned + ghost) global node IDs - // into a structure mapping global ID -> coordinate. - // 3. Use this map to fill final_node.coords. - - // 1. Build list of all global node IDs needed on this rank (owned + ghosts) - std::vector all_needed_node_gids(total_extended_nodes); - for (int i = 0; i < total_extended_nodes; i++) { - all_needed_node_gids[i] = final_mesh.local_to_global_node_mapping.host(i); + // -------------- Phase 1: Determine elements to send to each rank -------------- + std::vector> elems_to_send(world_size); + for (int lid = 0; lid < naive_mesh.num_elems; lid++) { + int dest = static_cast(partloctab[lid]); + int elem_gid = static_cast(naive_mesh.local_to_global_elem_mapping.host(lid)); + elems_to_send[dest].push_back(elem_gid); } - // 2. Build owned node GIDs and their coordinates - std::vector owned_gids(final_mesh.num_owned_nodes); - for (int i = 0; i < final_mesh.num_owned_nodes; i++) - owned_gids[i] = final_mesh.local_to_global_node_mapping.host(i); - - // 3. 
Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) - // so we can distribute the needed coordinate data. - // The easiest is to Allgather everyone's "owned_gids" and coords - - int local_owned_count = static_cast(owned_gids.size()); - std::vector owned_counts(world_size, 0); - if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 + // -------------- Phase 2: Exchange element GIDs -------------- + std::vector sendcounts(world_size), recvcounts(world_size); + for (int r = 0; r < world_size; r++) + sendcounts[r] = static_cast(elems_to_send[r].size()); - // a) Gather counts - owned_counts.resize(world_size, 0); - MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - // b) Displacements and total - std::vector owned_displs(world_size,0); - int total_owned = 0; + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements + std::vector sdispls(world_size), rdispls(world_size); + int send_total = 0, recv_total = 0; for (int r = 0; r < world_size; r++) { - owned_displs[r] = total_owned; - total_owned += owned_counts[r]; + sdispls[r] = send_total; + rdispls[r] = recv_total; + send_total += sendcounts[r]; + recv_total += recvcounts[r]; } - // c) Global GIDs (size: total_owned) - std::vector all_owned_gids(total_owned); - MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, - all_owned_gids.data(), owned_counts.data(), owned_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - - // d) Global coords (size: total_owned x 3) - std::vector owned_coords_send(3*local_owned_count, 0.0); - for (int i = 0; i < local_owned_count; i++) { - owned_coords_send[3*i+0] = intermediate_node.coords.host(i,0); - owned_coords_send[3*i+1] = intermediate_node.coords.host(i,1); - owned_coords_send[3*i+2] = intermediate_node.coords.host(i,2); - } - std::vector 
all_owned_coords(3 * total_owned, 0.0); + // Flatten send buffer + // send_elems: flattened list of element global IDs (GIDs) that this rank is sending to all other ranks. + // For each rank r, elems_to_send[r] contains the element GIDs that should be owned by rank r after repartitioning. + std::vector send_elems; + send_elems.reserve(send_total); + for (int r = 0; r < world_size; r++) + send_elems.insert(send_elems.end(), elems_to_send[r].begin(), elems_to_send[r].end()); - // Create coordinate-specific counts and displacements (in units of doubles, not nodes) - std::vector coord_counts(world_size); - std::vector coord_displs(world_size); - for (int r = 0; r < world_size; r++) { - coord_counts[r] = 3 * owned_counts[r]; // Each node has 3 doubles - coord_displs[r] = 3 * owned_displs[r]; // Displacement in doubles + // new_elem_gids: receives the list of new element global IDs this rank will own after the exchange. + // It is filled after MPI_Alltoallv completes, and contains the GIDs for the elements new to (or remained on) this rank. 
+ std::vector new_elem_gids(recv_total); + MPI_Alltoallv(send_elems.data(), sendcounts.data(), sdispls.data(), MPI_INT, + new_elem_gids.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // New elements owned by this rank + int num_new_elems = static_cast(new_elem_gids.size()); + + if (print_info) { + std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; } - MPI_Allgatherv(owned_coords_send.data(), 3*local_owned_count, MPI_DOUBLE, - all_owned_coords.data(), coord_counts.data(), coord_displs.data(), - MPI_DOUBLE, MPI_COMM_WORLD); + // -------------- Phase 3: Send element–node connectivity -------------- + int nodes_per_elem = naive_mesh.num_nodes_in_elem; - // e) Build map: gid -> coord[3] - std::unordered_map> gid_to_coord; - for (int i = 0; i < total_owned; i++) { - std::array xyz = { - all_owned_coords[3*i+0], - all_owned_coords[3*i+1], - all_owned_coords[3*i+2] - }; - gid_to_coord[all_owned_gids[i]] = xyz; - } + // Flatten element-node connectivity by global node IDs + std::vector conn_sendbuf; + for (int r = 0; r < world_size; r++) { + for (int elem_gid : elems_to_send[r]) { + // find local element lid from elem_gid + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; i++) + if (naive_mesh.local_to_global_elem_mapping.host(i) == elem_gid) { lid = i; break; } - // 4. Finally, fill final_node.coords with correct coordinates. 
- for (int i = 0; i < total_extended_nodes; i++) { - size_t gid = final_mesh.local_to_global_node_mapping.host(i); - auto it = gid_to_coord.find(gid); - if (it != gid_to_coord.end()) { - final_node.coords.host(i,0) = it->second[0]; - final_node.coords.host(i,1) = it->second[1]; - final_node.coords.host(i,2) = it->second[2]; - } else { - // Could happen if there's a bug: fill with zeros for safety - final_node.coords.host(i,0) = 0.0; - final_node.coords.host(i,1) = 0.0; - final_node.coords.host(i,2) = 0.0; + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + conn_sendbuf.push_back(node_gid); + } } } - final_node.coords.update_device(); - - - // -------------------------------------------------------------------------------------- - // Build the send patterns for elements - // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. - // Steps: - // 1) Each rank contributes its ghost element GIDs. - // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. - // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
- // -------------------------------------------------------------------------------------- - std::vector>> boundary_elem_targets(final_mesh.num_owned_elems); - // Prepare local ghost list as vector - std::vector ghost_gids_vec; - ghost_gids_vec.reserve(final_mesh.num_ghost_elems); - for (int i = 0; i < final_mesh.num_ghost_elems; i++) { - ghost_gids_vec.push_back(final_mesh.local_to_global_elem_mapping.host(final_mesh.num_owned_elems + i)); // Ghost elements are after the owned elements in the global element mapping - } + // element-node connectivity counts (ints per dest rank) + std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); + for (int r = 0; r < world_size; r++) + conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; - // Exchange counts - std::vector ghost_counts(world_size, 0); - int local_ghost_count = final_mesh.num_ghost_elems; - MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - // Displacements and recv buffer - std::vector ghost_displs(world_size, 0); - int total_ghosts = 0; - for (int r = 0; r < world_size; r++) { - ghost_displs[r] = total_ghosts; - total_ghosts += ghost_counts[r]; - } - std::vector all_ghost_gids(total_ghosts); - // Gather ghost gids - MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, - all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< ranks that ghost it - std::unordered_map> gid_to_ghosting_ranks; - gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); + std::vector conn_sdispls(world_size), conn_rdispls(world_size); + int conn_send_total = 0, conn_recv_total = 0; for (int r = 0; r < world_size; r++) { - int cnt = ghost_counts[r]; - int off = ghost_displs[r]; 
- for (int i = 0; i < cnt; i++) { - size_t g = all_ghost_gids[off + i]; - gid_to_ghosting_ranks[g].push_back(r); - } + conn_sdispls[r] = conn_send_total; + conn_rdispls[r] = conn_recv_total; + conn_send_total += conn_sendcounts[r]; + conn_recv_total += conn_recvcounts[r]; } - // For each local element, list destinations: ranks that ghost our gid - for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { - size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); - auto it = gid_to_ghosting_ranks.find(local_elem_gid); - if (it == gid_to_ghosting_ranks.end()) continue; - const std::vector &dest_ranks = it->second; - for (int rr : dest_ranks) { - if (rr == rank) continue; - boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); - } - } + std::vector conn_recvbuf(conn_recv_total); + MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, + conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - - // Add a vector to store boundary element local_ids (those who have ghost destinations across ranks) - std::vector boundary_elem_local_ids; - std::vector> boundary_to_ghost_ranks; // ragged array dimensions (num_boundary_elems, num_ghost_ranks) + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< ghost_comm_ranks; // set of ranks that this rank communicates with - + // -------------- Phase 4: Build new node list (unique GIDs) -------------- + std::set node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); + std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); + int num_new_nodes = static_cast(new_node_gids.size()); - for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { + // Build map gid→lid + std::unordered_map node_gid_to_lid; + for (int i = 0; i < num_new_nodes; i++) + node_gid_to_lid[new_node_gids[i]] = i; - int local_elem_gid = 
final_mesh.local_to_global_elem_mapping.host(elem_lid); - if (boundary_elem_targets[elem_lid].empty()) - { - continue; - } - else - { - // Fill in vector of boundary local_ids - boundary_elem_local_ids.push_back(elem_lid); - std::vector ghost_ranks_for_this_boundary_elem; - for (const auto &pr : boundary_elem_targets[elem_lid]) { - ghost_ranks_for_this_boundary_elem.push_back(pr.first); - ghost_comm_ranks.insert(pr.first); + if (print_info) + std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; + + + // -------------- Phase 5: Request node coordinates -------------- + std::vector node_coords_sendbuf; + for (int r = 0; r < world_size; r++) { + for (int gid : elems_to_send[r]) { + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; i++) + if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 0)); + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 1)); + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 2)); } - boundary_to_ghost_ranks.push_back(ghost_ranks_for_this_boundary_elem); } } - int num_ghost_comm_ranks = ghost_comm_ranks.size(); - std::vector ghost_comm_ranks_vec(num_ghost_comm_ranks); - int i = 0; - for (const auto &r : ghost_comm_ranks) { - ghost_comm_ranks_vec[i] = r; - i++; - } - + // Each node is 3 doubles; same sendcounts scaling applies + std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); + for (int r = 0; r < world_size; r++) + coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; + MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates 
counts"<(final_mesh.num_boundary_elems); - for (int i = 0; i < final_mesh.num_boundary_elems; i++) { - final_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; + std::vector coord_sdispls(world_size), coord_rdispls(world_size); + int coord_send_total = 0, coord_recv_total = 0; + for (int r = 0; r < world_size; r++) { + coord_sdispls[r] = coord_send_total; + coord_rdispls[r] = coord_recv_total; + coord_send_total += coord_sendcounts[r]; + coord_recv_total += coord_recvcounts[r]; } - final_mesh.boundary_elem_local_ids.update_device(); - print_info = false; + std::vector coord_recvbuf(coord_recv_total); + MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, + coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates"<(num_new_nodes); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); -// ****************************************************************************************** -// Create Communication Plan for element communication -// ****************************************************************************************** - + // Fill global mappings + for (int i = 0; i < num_new_nodes; i++) + intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; + for (int i = 0; i < num_new_elems; i++) + intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; - CommunicationPlan element_communication_plan; - element_communication_plan.initialize(MPI_COMM_WORLD); - // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator - // that efficiently represents the communication pattern between ranks. - // This allows MPI to optimize communication based on the actual connectivity pattern. 
- - - // ---------- Prepare INCOMING edges (sources) ---------- - // indegree: Number of ranks from which this rank will RECEIVE data - // These are the ranks that own elements which are ghosted on this rank - std::vector ghost_elem_receive_ranks_vec(ghost_elem_receive_ranks.begin(), - ghost_elem_receive_ranks.end()); - // The number of ranks from which this rank will receive data (incoming neighbors) - int indegree = static_cast(ghost_elem_receive_ranks_vec.size()); - - // sources: Array of source rank IDs (ranks we receive from) - // Each element corresponds to a rank that owns elements we ghost - int* sources = (indegree > 0) ? ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; + intermediate_mesh.local_to_global_node_mapping.update_device(); + intermediate_mesh.local_to_global_elem_mapping.update_device(); - - // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) - // Could be used to specify communication volume if needed for optimization - int* sourceweights = MPI_UNWEIGHTED; - - // ---------- Prepare OUTGOING edges (destinations) ---------- - // outdegree: Number of ranks to which this rank will SEND data - // These are the ranks that ghost elements owned by this rank - int outdegree = num_ghost_comm_ranks; - - // destinations: Array of destination rank IDs (ranks we send to) - // Each element corresponds to a rank that ghosts our owned elements - int* destinations = (outdegree > 0) ? 
ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; - // Initialize the graph communicator for element communication - element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); MPI_Barrier(MPI_COMM_WORLD); - // Optional: Verify the graph communicator was created successfully - // if(print_info) element_communication_plan.verify_graph_communicator(); + if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< elem_sendcounts(element_communication_plan.num_send_ranks, 0); - // std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); - - // Count how many boundary elements go to each destination rank - // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element - std::map> elems_to_send_by_rank; // rank -> list of boundary element local IDs - - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { - if (!boundary_elem_targets[elem_lid].empty()) { - for (const auto &pr : boundary_elem_targets[elem_lid]) { - int dest_rank = pr.first; - elems_to_send_by_rank[dest_rank].push_back(elem_lid); + // Binary search through local_to_global_node_mapping to find the equivalent local index + int left = 0, right = num_new_nodes - 1; + while (left <= right) { + int mid = left + (right - left) / 2; + size_t mid_gid = intermediate_mesh.local_to_global_node_mapping.host(mid); + if (node_gid == mid_gid) { + node_lid = mid; + break; + } else if (node_gid < mid_gid) { + right = mid - 1; + } else { + left = mid + 1; + } } + intermediate_mesh.nodes_in_elem.host(i, j) = node_lid; } } - // Serialize into a DRaggedRightArrayKokkos - CArrayKokkos strides_array(element_communication_plan.num_send_ranks); - for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { - int dest_rank = element_communication_plan.send_rank_ids.host(i); - strides_array(i) = 
elems_to_send_by_rank[dest_rank].size(); - } - DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished reverse mapping of the element-node connectivity from the global node ids to the local node ids"<> node_gid_to_coords; + int coord_idx = 0; + for (int e = 0; e < intermediate_mesh.num_elems; ++e) { + for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { + int node_gid = conn_recvbuf[e * intermediate_mesh.num_nodes_in_elem + j]; + if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { + node_gid_to_coords[node_gid] = { + coord_recvbuf[coord_idx*3 + 0], + coord_recvbuf[coord_idx*3 + 1], + coord_recvbuf[coord_idx*3 + 2] + }; + } + coord_idx++; } } - elems_to_send_by_rank_rr.update_device(); - - - // Count how many ghost elements come from each source rank - // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element - std::map> elems_to_recv_by_rank; // rank -> list of ghost element indices - for (size_t i = 0; i < ghost_elem_owner_ranks.size(); i++) { - int source_rank = ghost_elem_owner_ranks[i]; - int ghost_elem_local_id = final_mesh.num_owned_elems + i; - elems_to_recv_by_rank[source_rank].push_back(ghost_elem_local_id); - } - - // ========== Serialize into a DRaggedRightArrayKokkos ========== - CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); - for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - int source_rank = element_communication_plan.recv_rank_ids.host(i); - elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); - - } - DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); - // Fill in the data - for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - int source_rank = element_communication_plan.recv_rank_ids.host(i); - for (int j = 0; j < elems_to_recv_by_rank[source_rank].size(); j++) 
{ - elems_to_recv_by_rank_rr.host(i, j) = elems_to_recv_by_rank[source_rank][j]; + // Now fill coordinates in node order + intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); + for (int i = 0; i < num_new_nodes; i++) { + int node_gid = new_node_gids[i]; + auto it = node_gid_to_coords.find(node_gid); + if (it != node_gid_to_coords.end()) { + intermediate_node.coords.host(i, 0) = it->second[0]; + intermediate_node.coords.host(i, 1) = it->second[1]; + intermediate_node.coords.host(i, 2) = it->second[2]; } } - elems_to_recv_by_rank_rr.update_device(); - element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); + intermediate_node.coords.update_device(); + // Connectivity rebuild + intermediate_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); + + CommunicationPlan element_communication_plan; + element_communication_plan.initialize(MPI_COMM_WORLD); + + + build_ghost(intermediate_mesh, final_mesh, intermediate_node, final_node, element_communication_plan, world_size, rank); + + // ****************************************************************************************** // Test element communication using MPI_Neighbor_alltoallv diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 88727e2e..6551fd06 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -60,7 +60,7 @@ int main(int argc, char** argv) { // Read the mesh from a file - // read_vtk_mesh(initial_mesh, initial_node, 3, "meshes/buste.vtk"); + read_vtk_mesh(initial_mesh, initial_node, 3, "/home/jacobmoore/Desktop/repos/MATAR/meshes/impellerOpt.vtk"); double t_init_mesh_end = MPI_Wtime(); std::cout << "Initial mesh build time: " << (t_init_mesh_end - t_init_mesh_start) << " seconds" << std::endl; From 20261c972c9da67c9aed95dd9d0905fb41a68116 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 17 Nov 2025 16:08:37 -0600 Subject: [PATCH 32/52] ENH: WIP, adding nodal comms. 
--- examples/mesh_decomp/decomp_utils.h | 276 ++++++++++++--------------- examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mesh_io.h | 14 +- examples/mesh_decomp/state.h | 63 +++++- src/include/communication_plan.h | 185 +++++++++--------- src/include/mpi_types.h | 2 +- 6 files changed, 286 insertions(+), 256 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 5cdf4a6f..62986e6e 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "mesh.h" @@ -552,12 +553,40 @@ void naive_partition_mesh( return; } +/// @brief Builds ghost elements and nodes for distributed mesh decomposition. +/// +/// In distributed memory parallel computing with MPI, each rank owns a subset of the mesh. +/// Ghost elements and nodes are copies of elements/nodes from neighboring ranks that share +/// nodes with the locally-owned elements. This function identifies and extracts these ghost +/// entities to enable inter-rank communication and maintain consistency at domain boundaries. +/// +/// The algorithm operates in 5 primary steps: +/// 1. Gather element ownership information from all ranks using MPI_Allgatherv +/// 2. Collect local element-node connectivity for distribution +/// 3. Broadcast connectivity information to all ranks via MPI collective operations +/// 4. Identify which remote elements touch local elements (by shared nodes) +/// 5. 
Extract the full connectivity data for identified ghost elements and their nodes +/// +/// @param[in] input_mesh The locally-owned mesh on this rank containing local elements/nodes +/// @param[out] output_mesh The enriched mesh with ghost elements and nodes added to local mesh +/// @param[in] input_node Node data associated with the input mesh +/// @param[out] output_node Node data extended with ghost nodes +/// @param[in,out] element_communication_plan MPI communication plan specifying which ranks +/// exchange element data (populated by this function) +/// @param[in] world_size Total number of MPI ranks +/// @param[in] rank Current MPI rank (process ID) +/// +/// @note This is a collective MPI operation - all ranks must call this function together. +/// @note Uses data-oriented programming patterns with device-accessible arrays (MATAR containers) +/// @note Performance: O(n_local_elements * n_nodes_per_element) for local operations, +/// plus O(n_global_elements) for global MPI collective operations void build_ghost( Mesh_t& input_mesh, Mesh_t& output_mesh, node_t& input_node, node_t& output_node, CommunicationPlan& element_communication_plan, + CommunicationPlan& node_communication_plan, int world_size, int rank) { @@ -644,18 +673,18 @@ void build_ghost( } // ======================================================================== - // STEP 2: Build element-to-node connectivity for local elements + // STEP 2: Build index sets for local elements and nodes // ======================================================================== - // Ghost elements are elements from other ranks that share nodes with our - // locally-owned elements. To identify them, we need to exchange element-node - // connectivity information with all other ranks. 
- - // Collect all nodes that belong to our locally-owned elements - // This set will be used later to check if a remote element is relevant - std::set local_elem_nodes; + std::set local_node_gids; for(int node_rid = 0; node_rid < input_mesh.num_nodes; node_rid++) { size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_rid); - local_elem_nodes.insert(node_gid); + local_node_gids.insert(node_gid); + } + + // Build a set of locally-owned element GIDs for quick lookup + std::set local_elem_gids; + for (int i = 0; i < input_mesh.num_elems; i++) { + local_elem_gids.insert(input_mesh.local_to_global_elem_mapping.host(i)); } // ======================================================================== @@ -725,15 +754,12 @@ void build_ghost( // A ghost element is an element owned by another rank that shares at least // one node with our locally-owned elements. This step identifies all such elements. - // Build a set of locally-owned element GIDs for quick lookup - std::set local_elem_gids; - for (int i = 0; i < input_mesh.num_elems; i++) { - local_elem_gids.insert(input_mesh.local_to_global_elem_mapping.host(i)); - } + + // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) + std::set ghost_elem_gids; + std::set ghost_node_gids; - // Build a temporary map: node GID -> set of element GIDs (from other ranks) that contain it - // This helps us identify which remote elements are adjacent to our local elements - std::map> node_to_ext_elem; + std::map ghost_node_recv_rank; // Iterate through connectivity data from each rank (except ourselves) for (int r = 0; r < world_size; r++) { @@ -751,51 +777,21 @@ void build_ghost( size_t node_gid = all_conn[offset + 1]; // Check if this node belongs to one of our locally-owned elements - if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + ghost_node_gids.insert(node_gid); + ghost_node_recv_rank[node_gid] = 
r; // Check if this element is NOT owned by us (i.e., it's from another rank) if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { // This is a ghost element for us - node_to_ext_elem[node_gid].insert(elem_gid); + ghost_elem_gids.insert(elem_gid); } } } } - // Extract all unique ghost element GIDs - // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) - std::set ghost_elem_gids; - for (const auto& pair : node_to_ext_elem) { - for (size_t elem_gid : pair.second) { - ghost_elem_gids.insert(elem_gid); - } - } - - // Additional check: elements that are neighbors of our locally-owned elements - // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) - - // for (int lid = 0; lid < num_new_elems; lid++) { - // size_t num_neighbors = input_mesh.num_elems_in_elem(lid); - - // for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { - // size_t neighbor_lid = input_mesh.elems_in_elem(lid, nbr_idx); - - // if (neighbor_lid < static_cast(num_new_elems)) { - // size_t neighbor_gid = input_mesh.local_to_global_elem_mapping(neighbor_lid); - - // // Check if neighbor is owned by this rank - // auto it = elem_gid_to_rank.find(neighbor_gid); - // if (it != elem_gid_to_rank.end() && it->second != rank) { - // // Neighbor is owned by another rank - it's a ghost for us - // std::cout << "[rank " << rank << "] found ghost element " << neighbor_gid << std::endl; - // ghost_elem_gids.insert(neighbor_gid); - // } - // } - // } - // } - // Store the count of ghost elements for later use input_mesh.num_ghost_elems = ghost_elem_gids.size(); - + input_mesh.num_ghost_nodes = ghost_node_gids.size(); MPI_Barrier(MPI_COMM_WORLD); @@ -1025,7 +1021,6 @@ void build_ghost( output_mesh.num_ghost_elems = ghost_elem_gids.size(); output_mesh.num_ghost_nodes = ghost_only_nodes.size(); - output_mesh.num_owned_elems = input_mesh.num_elems; output_mesh.num_owned_nodes = input_mesh.num_nodes; @@ -1108,6 
+1103,16 @@ void build_ghost( all_owned_gids.data(), owned_counts.data(), owned_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + // Map node gid -> owning rank + std::unordered_map node_gid_to_owner_rank; + int owner_offset = 0; + for (int r = 0; r < world_size; r++) { + for (int i = 0; i < owned_counts[r]; i++) { + node_gid_to_owner_rank[all_owned_gids[owner_offset + i]] = r; + } + owner_offset += owned_counts[r]; + } + // d) Global coords (size: total_owned x 3) std::vector owned_coords_send(3*local_owned_count, 0.0); @@ -1312,7 +1317,7 @@ void build_ghost( // Optional: Verify the graph communicator was created successfully // if(print_info) element_communication_plan.verify_graph_communicator(); - // ****************************************************************************************** +// ****************************************************************************************** // Build send counts and displacements for element communication // ****************************************************************************************** @@ -1385,6 +1390,18 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); + // -------------------------------------------------------------------------------------- + // Build the send pattern for nodes + // -------------------------------------------------------------------------------------- + // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost node GIDs. + // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + + + } @@ -2023,10 +2040,13 @@ void partition_mesh( CommunicationPlan element_communication_plan; element_communication_plan.initialize(MPI_COMM_WORLD); + + + CommunicationPlan node_communication_plan; + node_communication_plan.initialize(MPI_COMM_WORLD); - - build_ghost(intermediate_mesh, final_mesh, intermediate_node, final_node, element_communication_plan, world_size, rank); - + build_ghost(intermediate_mesh, final_mesh, intermediate_node, final_node, element_communication_plan, node_communication_plan, world_size, rank); + MPI_Barrier(MPI_COMM_WORLD); // ****************************************************************************************** @@ -2082,108 +2102,48 @@ void partition_mesh( - // -------------------------------------------------------------------------------------- - // TODO: Build the send pattern for nodes -------------------------------------------------------------------------------------- - // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. - // Steps: - // 1) Each rank contributes its ghost node GIDs. - // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. - // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
- // -------------------------------------------------------------------------------------- - - // std::vector>> boundary_node_targets(intermediate_mesh.num_nodes); - - // // Prepare local ghost node list as vector - // std::vector ghost_node_gids_vec; - // ghost_node_gids_vec.reserve(ghost_only_nodes.size()); - // for (const auto &g : ghost_only_nodes) ghost_node_gids_vec.push_back(g); - - // // Exchange counts - // std::vector ghost_node_counts(world_size, 0); - // int local_ghost_node_count = static_cast(ghost_node_gids_vec.size()); - // MPI_Allgather(&local_ghost_node_count, 1, MPI_INT, ghost_node_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - // // Displacements and recv buffer - // std::vector ghost_node_displs(world_size, 0); - // int total_ghost_nodes = 0; - // for (int r = 0; r < world_size; r++) { - // ghost_node_displs[r] = total_ghost_nodes; - // total_ghost_nodes += ghost_node_counts[r]; - // } - // std::vector all_ghost_node_gids(total_ghost_nodes); - - // // Gather ghost node gids - // MPI_Allgatherv(ghost_node_gids_vec.data(), local_ghost_node_count, MPI_UNSIGNED_LONG_LONG, - // all_ghost_node_gids.data(), ghost_node_counts.data(), ghost_node_displs.data(), - // MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - - // MPI_Barrier(MPI_COMM_WORLD); - // if(rank == 0) std::cout << " Finished gathering ghost node GIDs" << std::endl; - - - // MPI_Barrier(MPI_COMM_WORLD); - // if(rank == 0) std::cout << " Starting to build the reverse map for node communication" << std::endl; + // Test node communication using MPI_Neighbor_alltoallv + std::vector node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; + final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); - // // Build map node_gid -> ranks that ghost it - // std::unordered_map> node_gid_to_ghosting_ranks; - // node_gid_to_ghosting_ranks.reserve(static_cast(total_ghost_nodes)); - // for (int r = 0; r < world_size; r++) { - // int cnt = 
ghost_node_counts[r]; - // int off = ghost_node_displs[r]; - // for (int i = 0; i < cnt; i++) { - // size_t g = all_ghost_node_gids[off + i]; - // node_gid_to_ghosting_ranks[g].push_back(r); - // } - // } - - // // For each local node, list destinations: ranks that ghost our node gid - // for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { - // size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - // auto it = node_gid_to_ghosting_ranks.find(local_node_gid); - // if (it == node_gid_to_ghosting_ranks.end()) continue; - // const std::vector &dest_ranks = it->second; - // for (int rr : dest_ranks) { - // if (rr == rank) continue; - // boundary_node_targets[node_lid].push_back(std::make_pair(rr, local_node_gid)); - // } - // } - - // std::cout.flush(); - // MPI_Barrier(MPI_COMM_WORLD); - // print_info = false; - - // // Optional: print a compact summary of node reverse map for verification (limited output) - // for(int i = 0; i < world_size; i++) { - // if (rank == i && print_info) { - // std::cout << std::endl; - // for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { - - // size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - // if (boundary_node_targets[node_lid].empty()) - // { - // std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: no ghost nodes" << std::endl; - // } - // else - // { - // std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: "; - // int shown = 0; - // for (const auto &pr : boundary_node_targets[node_lid]) { - // if (shown >= 12) { std::cout << " ..."; break; } - // std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; - // shown++; - // } - // std::cout << std::endl; - // } - // } - // std::cout.flush(); - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // print_info = false; - - // 
MPI_Barrier(MPI_COMM_WORLD); - // if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; + for (int i = 0; i < final_mesh.num_owned_nodes; i++) { + final_node.scalar_field.host(i) = static_cast(rank); + final_node.vector_field.host(i, 0) = static_cast(rank); + final_node.vector_field.host(i, 1) = static_cast(rank); + final_node.vector_field.host(i, 2) = static_cast(rank); + } + for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { + final_node.scalar_field.host(i) = -1.0; + final_node.vector_field.host(i, 0) = -1.0; + final_node.vector_field.host(i, 1) = -1.0; + final_node.vector_field.host(i, 2) = -1.0; + } + + final_node.coords.update_device(); + final_node.scalar_field.update_device(); + final_node.vector_field.update_device(); + + final_node.scalar_field.communicate(); + final_node.vector_field.communicate(); + MPI_Barrier(MPI_COMM_WORLD); + + + // Update scalar field to visualize the communication + + for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + double value = 0.0; + for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); + } + value /= final_mesh.num_nodes_in_elem; + + for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; + } + } + + + } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 6551fd06..2113e9d6 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -60,7 +60,7 @@ int main(int argc, char** argv) { // Read the mesh from a file - read_vtk_mesh(initial_mesh, initial_node, 3, "/home/jacobmoore/Desktop/repos/MATAR/meshes/impellerOpt.vtk"); + // read_vtk_mesh(initial_mesh, initial_node, 3, "/home/jacobmoore/Desktop/repos/MATAR/meshes/impellerOpt.vtk"); double t_init_mesh_end = MPI_Wtime(); std::cout << "Initial mesh build time: " 
<< (t_init_mesh_end - t_init_mesh_start) << " seconds" << std::endl; diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index e6fc65de..c9a75a0f 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -543,8 +543,8 @@ void write_vtu(Mesh_t& mesh, const int num_cell_vec_vars = 1; const int num_cell_tensor_vars = 0; - const int num_point_scalar_vars = 3; - const int num_point_vec_vars = 1; + const int num_point_scalar_vars = 4; + const int num_point_vec_vars = 2; // Scalar values associated with a cell const char cell_scalar_var_names[num_cell_scalar_vars][30] = { @@ -556,11 +556,11 @@ void write_vtu(Mesh_t& mesh, }; const char point_scalar_var_names[num_point_scalar_vars][15] = { - "rank_id", "elems_in_node", "global_node_id" + "rank_id", "elems_in_node", "global_node_id", "scalar_field" }; const char point_vec_var_names[num_point_vec_vars][15] = { - "pos" + "pos", "vector_field" }; // short hand @@ -592,9 +592,15 @@ void write_vtu(Mesh_t& mesh, vec_fields(node_gid, 0, 1) = node.coords.host(node_gid, 1); vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + // vector field, var 1 + vec_fields(node_gid, 1, 0) = node.vector_field.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = node.vector_field.host(node_gid, 1); + vec_fields(node_gid, 1, 2) = node.vector_field.host(node_gid, 2); + point_scalar_fields(node_gid, 0) = rank; point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); point_scalar_fields(node_gid, 2) = (double)mesh.local_to_global_node_mapping.host(node_gid); + point_scalar_fields(node_gid, 3) = node.scalar_field.host(node_gid); } // File management diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 0da00095..eb3d5a6b 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -43,7 +43,9 @@ using namespace mtr; // Possible node states, used to initialize node_t enum class node_state { - coords + coords, + scalar_field, + 
vector_field }; @@ -58,17 +60,68 @@ struct node_t { // Replace with MPIDCArrayKokkos - DCArrayKokkos coords; ///< Nodal coordinates - DCArrayKokkos coords_n0; ///< Nodal coordinates at tn=0 of time integration + MPICArrayKokkos coords; ///< Nodal coordinates + MPICArrayKokkos coords_n0; ///< Nodal coordinates at tn=0 of time integration + MPICArrayKokkos scalar_field; ///< Scalar field on a node + MPICArrayKokkos vector_field; ///< Vector field on a node + + // initialization method (num_nodes, num_dims, state to allocate) void initialize(size_t num_nodes, size_t num_dims, std::vector node_states) + { + + CommunicationPlan comm_plan; + + for (auto field : node_states){ + switch(field){ + case node_state::coords: + if (coords.size() == 0){ + this->coords = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates"); + this->coords.initialize_comm_plan(comm_plan); + } + if (coords_n0.size() == 0){ + this->coords_n0 = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + this->coords_n0.initialize_comm_plan(comm_plan); + } + break; + case node_state::scalar_field: + if (scalar_field.size() == 0) this->scalar_field = MPICArrayKokkos(num_nodes, "node_scalar_field"); + this->scalar_field.initialize_comm_plan(comm_plan); + break; + case node_state::vector_field: + if (vector_field.size() == 0) this->vector_field = MPICArrayKokkos(num_nodes, num_dims, "node_vector_field"); + this->vector_field.initialize_comm_plan(comm_plan); + break; + default: + std::cout<<"Desired node state not understood in node_t initialize"< node_states, CommunicationPlan& comm_plan) { for (auto field : node_states){ switch(field){ case node_state::coords: - if (coords.size() == 0) this->coords = DCArrayKokkos(num_nodes, num_dims, "node_coordinates"); - if (coords_n0.size() == 0) this->coords_n0 = DCArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + if (coords.size() == 0){ + this->coords = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates"); + 
this->coords.initialize_comm_plan(comm_plan); + } + if (coords_n0.size() == 0){ + this->coords_n0 = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + this->coords_n0.initialize_comm_plan(comm_plan); + } + break; + case node_state::scalar_field: + if (scalar_field.size() == 0) this->scalar_field = MPICArrayKokkos(num_nodes, "node_scalar_field"); + this->scalar_field.initialize_comm_plan(comm_plan); + break; + case node_state::vector_field: + if (vector_field.size() == 0) this->vector_field = MPICArrayKokkos(num_nodes, num_dims, "node_vector_field"); + this->vector_field.initialize_comm_plan(comm_plan); break; default: std::cout<<"Desired node state not understood in node_t initialize"< #include "matar.h" +#include + using namespace mtr; /** @@ -19,12 +21,20 @@ using namespace mtr; * elem.density.comm() -> automatically syncs ghost elements * */ +enum class communication_plan_type { + no_communication, + all_to_all_graph +}; + + struct CommunicationPlan { // ======================================================================== // Metadata for MPI neighbor graph communication // ======================================================================== + communication_plan_type comm_type = communication_plan_type::no_communication; + // MPI world communicator MPI_Comm mpi_comm_world; bool has_comm_world = false; @@ -164,6 +174,7 @@ using namespace mtr; */ void initialize_graph_communicator(int num_send_ranks, int* send_rank_ids, int num_recv_ranks, int* recv_rank_ids){ + this->comm_type = communication_plan_type::all_to_all_graph; // Check if the MPI_COMM_WORLD communicator has been initialized. 
if(!has_comm_world){ throw std::runtime_error("MPI communicator for the world has not been initialized"); @@ -205,105 +216,105 @@ using namespace mtr; has_comm_graph = true; } - // void verify_graph_communicator(){ - // if(!has_comm_graph){ - // throw std::runtime_error("MPI graph communicator has not been initialized"); - // } + void verify_graph_communicator(){ + if(!has_comm_graph){ + throw std::runtime_error("MPI graph communicator has not been initialized"); + } - // // ============================================================================ - // // Verify the distributed graph communicator - // // ============================================================================ - // // Query the graph to verify it matches what we specified - // int indegree_out, outdegree_out, weighted; - // MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); + // ============================================================================ + // Verify the distributed graph communicator + // ============================================================================ + // Query the graph to verify it matches what we specified + int indegree_out, outdegree_out, weighted; + MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); - // // Allocate arrays to receive neighbor information - // std::vector sources_out(indegree_out); - // std::vector sourceweights_out(indegree_out); - // std::vector destinations_out(outdegree_out); - // std::vector destweights_out(outdegree_out); + // Allocate arrays to receive neighbor information + std::vector sources_out(indegree_out); + std::vector sourceweights_out(indegree_out); + std::vector destinations_out(outdegree_out); + std::vector destweights_out(outdegree_out); - // // Retrieve the actual neighbors from the graph communicator - // MPI_Dist_graph_neighbors(mpi_comm_graph, - // indegree_out, sources_out.data(), sourceweights_out.data(), - // outdegree_out, 
destinations_out.data(), destweights_out.data()); + // Retrieve the actual neighbors from the graph communicator + MPI_Dist_graph_neighbors(mpi_comm_graph, + indegree_out, sources_out.data(), sourceweights_out.data(), + outdegree_out, destinations_out.data(), destweights_out.data()); - // int rank = -1; - // MPI_Comm_rank(mpi_comm_world, &rank); + int rank = -1; + MPI_Comm_rank(mpi_comm_world, &rank); - // // Additional verification: Check if the queried values match our input - // bool verification_passed = true; + // Additional verification: Check if the queried values match our input + bool verification_passed = true; - // // Print verification information for each rank sequentially - // for (int r = 0; r < world_size; ++r) { - // MPI_Barrier(mpi_comm_world); - // if (rank == r) { - // std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; - // std::cout << " Indegree (receives from " << indegree_out << " ranks): "; - // for (int i = 0; i < indegree_out; ++i) { - // std::cout << sources_out[i] << " "; - // } - // std::cout << std::endl; + // Print verification information for each rank sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(mpi_comm_world); + if (rank == r) { + std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; + std::cout << " Indegree (receives from " << indegree_out << " ranks): "; + for (int i = 0; i < indegree_out; ++i) { + std::cout << sources_out[i] << " "; + } + std::cout << std::endl; - // std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; - // for (int i = 0; i < outdegree_out; ++i) { - // std::cout << destinations_out[i] << " "; - // } - // std::cout << std::endl; + std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; + for (int i = 0; i < outdegree_out; ++i) { + std::cout << destinations_out[i] << " "; + } + std::cout << std::endl; - // std::cout << " Weighted: " << (weighted ? 
"yes" : "no") << std::endl; - // } - // MPI_Barrier(mpi_comm_world); - // } + std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; + } + MPI_Barrier(mpi_comm_world); + } - // // Check if the counts match our stored values - // if (indegree_out != num_recv_ranks) { - // std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " - // << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; - // verification_passed = false; - // } - // if (outdegree_out != num_send_ranks) { - // std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " - // << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; - // verification_passed = false; - // } + // Check if the counts match our stored values + if (indegree_out != num_recv_ranks) { + std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " + << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; + verification_passed = false; + } + if (outdegree_out != num_send_ranks) { + std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " + << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; + verification_passed = false; + } - // // Check if source ranks match (build set from our stored recv_rank_ids) - // std::set sources_set_in; - // for (int i = 0; i < num_recv_ranks; ++i) { - // sources_set_in.insert(recv_rank_ids.host(i)); - // } - // std::set sources_set_out(sources_out.begin(), sources_out.end()); - // if (sources_set_in != sources_set_out) { - // std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" 
<< std::endl; - // verification_passed = false; - // } + // Check if source ranks match (build set from our stored recv_rank_ids) + std::set sources_set_in; + for (int i = 0; i < num_recv_ranks; ++i) { + sources_set_in.insert(recv_rank_ids.host(i)); + } + std::set sources_set_out(sources_out.begin(), sources_out.end()); + if (sources_set_in != sources_set_out) { + std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" << std::endl; + verification_passed = false; + } - // // Check if destination ranks match (build set from our stored send_rank_ids) - // std::set dests_set_in; - // for (int i = 0; i < num_send_ranks; ++i) { - // dests_set_in.insert(send_rank_ids.host(i)); - // } - // std::set dests_set_out(destinations_out.begin(), destinations_out.end()); - // if (dests_set_in != dests_set_out) { - // std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; - // verification_passed = false; - // } + // Check if destination ranks match (build set from our stored send_rank_ids) + std::set dests_set_in; + for (int i = 0; i < num_send_ranks; ++i) { + dests_set_in.insert(send_rank_ids.host(i)); + } + std::set dests_set_out(destinations_out.begin(), destinations_out.end()); + if (dests_set_in != dests_set_out) { + std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; + verification_passed = false; + } - // // Global verification check - // int local_passed = verification_passed ? 
1 : 0; - // int global_passed = 0; - // MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); - // MPI_Barrier(mpi_comm_world); - // if (rank == 0) { - // if (global_passed) { - // std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; - // } else { - // std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; - // } - // } - // MPI_Barrier(mpi_comm_world); - // } + // Global verification check + int local_passed = verification_passed ? 1 : 0; + int global_passed = 0; + MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); + MPI_Barrier(mpi_comm_world); + if (rank == 0) { + if (global_passed) { + std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(mpi_comm_world); + } void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index ac651551..10e58121 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -301,7 +301,7 @@ class MPICArrayKokkos { this_array_.update_host(); fill_send_buffer(); - + MPI_Neighbor_alltoallv( send_buffer_.host_pointer(), send_counts_.host_pointer(), From 9c1e9b69c76e5ae842f9873cc64d6c419acab073 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 18 Nov 2025 15:56:05 -0600 Subject: [PATCH 33/52] BUG: Debugging nodal comms, WIP --- examples/mesh_decomp/decomp_utils.h | 336 +++++++++++++++++++++++---- examples/mesh_decomp/mesh_decomp.cpp | 8 +- src/include/communication_plan.h | 150 ++++++++++++ 3 files changed, 449 insertions(+), 45 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 62986e6e..ff697abe 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ 
b/examples/mesh_decomp/decomp_utils.h @@ -676,9 +676,11 @@ void build_ghost( // STEP 2: Build index sets for local elements and nodes // ======================================================================== std::set local_node_gids; + std::map global_to_local_node_mapping; // GID -> local index mapping for(int node_rid = 0; node_rid < input_mesh.num_nodes; node_rid++) { size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_rid); local_node_gids.insert(node_gid); + global_to_local_node_mapping[node_gid] = node_rid; } // Build a set of locally-owned element GIDs for quick lookup @@ -778,8 +780,7 @@ void build_ghost( // Check if this node belongs to one of our locally-owned elements if (local_node_gids.find(node_gid) != local_node_gids.end()) { - ghost_node_gids.insert(node_gid); - ghost_node_recv_rank[node_gid] = r; + // Check if this element is NOT owned by us (i.e., it's from another rank) if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { // This is a ghost element for us @@ -788,6 +789,122 @@ void build_ghost( } } } + MPI_Barrier(MPI_COMM_WORLD); + + + std::map> ghost_nodes_from_ranks; + + std::set shared_nodes; // nodes on MPI rank boundaries + + // Iterate through connectivity data from each rank (except ourselves) + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this element belongs to one of our ghost elements + if (ghost_elem_gids.find(elem_gid) != ghost_elem_gids.end()) { + + // Check if this node is NOT owned by us (i.e., it's from another rank) + if (local_node_gids.find(node_gid) == local_node_gids.end()) { + // This is a ghost node for us + ghost_node_gids.insert(node_gid); + ghost_node_recv_rank[node_gid] = r; + ghost_nodes_from_ranks[r].insert(node_gid); + } + } + } + } + + // WARNING: HERE IS THE BUG: + // When we create the send pattern for ghost nodes, we are not filtering out nodes that are on MPI rank boundaries + + // Create a vector of the ranks that this rank will receive data from for ghost nodes + std::set ghost_node_receive_ranks; + for (const auto& pair : ghost_node_recv_rank) { + ghost_node_receive_ranks.insert(pair.second); + } + + std::vector ghost_node_receive_ranks_vec(ghost_node_receive_ranks.begin(), ghost_node_receive_ranks.end()); + + + // Print out the ghost node receive ranks for each rank sequentially + for (int r = 0; r < world_size; r++) { + if (rank == r) { + MPI_Barrier(MPI_COMM_WORLD); + std::cout << "Rank " << rank << " will receive data from the following ranks for ghost nodes: "; + for (int r : ghost_node_receive_ranks_vec) { + std::cout << r << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + // Find which nodes *we own* are ghosted on other ranks, and on which ranks + // We want: for each of our local nodes, the list of ranks that ghost it + + // Map: local_node_gid -> set of remote ranks that ghost this node + std::map> local_node_gid_to_ghosting_ranks; + + std::vector> ghosted_nodes_on_ranks(world_size); + + // Iterate 
 through connectivity from all ranks except ourselves + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // skip our own rank + + int num_pairs = conn_sizes[r] / 2; + for (int i = 0; i < num_pairs; i++) { + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this node is owned by us, and remote rank references it, they are ghosting it + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + local_node_gid_to_ghosting_ranks[node_gid].insert(r); + ghosted_nodes_on_ranks[r].insert(node_gid); + } + } + } + + // Use the map to create a vector of the ranks that this rank will send data to for ghost nodes + std::set ghost_node_send_ranks; + for (const auto& pair : local_node_gid_to_ghosting_ranks) { + ghost_node_send_ranks.insert(pair.second.begin(), pair.second.end()); + } + std::vector ghost_node_send_ranks_vec(ghost_node_send_ranks.begin(), ghost_node_send_ranks.end()); + + std::map> nodes_to_send_by_rank; // rank -> list of local node indices + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; + for (size_t node_gid : ghosted_nodes_on_ranks[r]) { + int local_node_id = global_to_local_node_mapping[node_gid]; + nodes_to_send_by_rank[r].push_back(local_node_id); + } + } + + //print out the nodes to send by rank for each rank sequentially + for (int r = 0; r < world_size; r++) { + if (rank == r) { + std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; + for (const auto& rank_node_pair : nodes_to_send_by_rank) { + std::cout << rank_node_pair.first << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + // Store the count of ghost elements for later use input_mesh.num_ghost_elems = ghost_elem_gids.size(); @@ -871,6 +988,7 @@ void build_ghost( } } + // Assign extended local IDs to ghost-only nodes + for (size_t node_gid : ghost_only_nodes) { + node_gid_to_extended_lid[node_gid] = 
extended_node_lid++; @@ -1055,7 +1173,8 @@ void build_ghost( // ****************************************************************************************** - output_node.initialize(total_extended_nodes, 3, {node_state::coords}); + output_node.initialize(total_extended_nodes, 3, {node_state::coords}, node_communication_plan); + MPI_Barrier(MPI_COMM_WORLD); // The goal here is to populate output_node.coords using globally gathered ghost node coordinates, // since input_node does not contain ghost node coordinates. @@ -1278,7 +1397,7 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - +// Initialize graph comms for elements // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator // that efficiently represents the communication pattern between ranks. // This allows MPI to optimize communication based on the actual connectivity pattern. @@ -1290,11 +1409,11 @@ void build_ghost( std::vector ghost_elem_receive_ranks_vec(ghost_elem_receive_ranks.begin(), ghost_elem_receive_ranks.end()); // The number of ranks from which this rank will receive data (incoming neighbors) - int indegree = static_cast(ghost_elem_receive_ranks_vec.size()); + int elem_indegree = static_cast(ghost_elem_receive_ranks_vec.size()); // sources: Array of source rank IDs (ranks we receive from) // Each element corresponds to a rank that owns elements we ghost - int* sources = (indegree > 0) ? ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; + int* sources = (elem_indegree > 0) ? ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) @@ -1311,12 +1430,41 @@ void build_ghost( int* destinations = (outdegree > 0) ? 
ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; // Initialize the graph communicator for element communication - element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); + element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), elem_indegree, ghost_elem_receive_ranks_vec.data()); MPI_Barrier(MPI_COMM_WORLD); // Optional: Verify the graph communicator was created successfully // if(print_info) element_communication_plan.verify_graph_communicator(); + +// Initialize graph comms for nodes + // ---------- Prepare INCOMING edges (sources) ---------- + // indegree: Number of ranks from which this rank will RECEIVE data + // These are the ranks that own nodes which are ghosted on this rank + int node_indegree = static_cast(ghost_node_receive_ranks.size()); + int* node_sources = (node_indegree > 0) ? ghost_node_receive_ranks_vec.data() : MPI_UNWEIGHTED; + + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + int* node_sourceweights = MPI_UNWEIGHTED; + + // ---------- Prepare OUTGOING edges (destinations) ---------- + // outdegree: Number of ranks to which this rank will SEND data + // These are the ranks that ghost nodes owned by this rank + int node_outdegree = static_cast(ghost_node_send_ranks.size()); + int* node_destinations = (node_outdegree > 0) ? 
ghost_node_send_ranks_vec.data() : MPI_UNWEIGHTED; + + // destinationweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) + int* node_destinationweights = MPI_UNWEIGHTED; + + // Initialize the graph communicator for node communication + node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); + MPI_Barrier(MPI_COMM_WORLD); + + // Optional: Verify the graph communicator was created successfully + print_info = true; + if(print_info) node_communication_plan.verify_graph_communicator(); + print_info = false; + // ****************************************************************************************** // Build send counts and displacements for element communication // ****************************************************************************************** @@ -1399,10 +1547,112 @@ void build_ghost( // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
// -------------------------------------------------------------------------------------- + + + // Print out the nodes to send by rank for each rank sequentially + for (int r = 0; r < world_size; r++) { + if (rank == r) { + std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; + for (const auto& rank_node_pair : nodes_to_send_by_rank) { + std::cout << rank_node_pair.first << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + // Serialize into a DRaggedRightArrayKokkos + CArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks); + for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + int dest_rank = node_communication_plan.send_rank_ids.host(i); + node_send_strides_array(i) = nodes_to_send_by_rank[dest_rank].size(); + } + DRaggedRightArrayKokkos nodes_to_send_by_rank_rr(node_send_strides_array, "nodes_to_send_by_rank"); + + // Fill in the data + for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + int dest_rank = node_communication_plan.send_rank_ids.host(i); + for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { + nodes_to_send_by_rank_rr.host(i, j) = nodes_to_send_by_rank[dest_rank][j]; + } + } + nodes_to_send_by_rank_rr.update_device(); + + + + // Count how many ghost nodes come from each source rank + std::map> nodes_to_recv_by_rank; // rank -> list of ghost node indices + int ghost_node_index = 0; + for (size_t ghost_node_gid : ghost_node_gids) { + int source_rank = ghost_node_recv_rank[ghost_node_gid]; + int ghost_node_local_id = output_mesh.num_owned_nodes + ghost_node_index; + nodes_to_recv_by_rank[source_rank].push_back(ghost_node_local_id); + ghost_node_index++; + } - + // Serialize into a DRaggedRightArrayKokkos + CArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks); + for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + int source_rank = node_communication_plan.recv_rank_ids.host(i); 
+ nodes_recv_strides_array(i) = nodes_to_recv_by_rank[source_rank].size(); + } + DRaggedRightArrayKokkos nodes_to_recv_by_rank_rr(nodes_recv_strides_array, "nodes_to_recv_by_rank"); + // Fill in the data + for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + int source_rank = node_communication_plan.recv_rank_ids.host(i); + for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { + size_t local_id = nodes_to_recv_by_rank[source_rank][j]; + nodes_to_recv_by_rank_rr.host(i, j) = nodes_to_recv_by_rank[source_rank][j]; + } + } + nodes_to_recv_by_rank_rr.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); + + // print the nodes to send by rank rr for each rank sequentially + for (int r = 0; r < world_size; r++) { + if (rank == r) { + std::cout << "Rank " << rank << " will send nodes to the following ranks (nodes_to_send_by_rank_rr):" << std::endl; + for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + int dest_rank = node_communication_plan.send_rank_ids.host(i); + std::cout << " To rank " << dest_rank << ": ["; + for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { + int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); + std::cout << global_node_id << " "; + } + std::cout << "]" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + // print the nodes to send by rank rr for each rank sequentially + for (int r = 0; r < world_size; r++) { + if (rank == r) { + std::cout << "Rank " << rank << " will receive nodes from the following ranks (nodes_to_recv_by_rank_rr):" << std::endl; + for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + int source_rank = node_communication_plan.recv_rank_ids.host(i); + std::cout << " From rank " << source_rank << ": ["; + for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { + int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_recv_by_rank[source_rank][j]); + std::cout 
<< global_node_id << " "; + } + std::cout << "]" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + node_communication_plan.setup_send_recv(nodes_to_send_by_rank_rr, nodes_to_recv_by_rank_rr); + MPI_Barrier(MPI_COMM_WORLD); + + node_communication_plan.verify_send_recv(); + } @@ -2106,41 +2356,41 @@ void partition_mesh( std::vector node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); - for (int i = 0; i < final_mesh.num_owned_nodes; i++) { - final_node.scalar_field.host(i) = static_cast(rank); - final_node.vector_field.host(i, 0) = static_cast(rank); - final_node.vector_field.host(i, 1) = static_cast(rank); - final_node.vector_field.host(i, 2) = static_cast(rank); - } - for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { - final_node.scalar_field.host(i) = -1.0; - final_node.vector_field.host(i, 0) = -1.0; - final_node.vector_field.host(i, 1) = -1.0; - final_node.vector_field.host(i, 2) = -1.0; - } - - final_node.coords.update_device(); - final_node.scalar_field.update_device(); - final_node.vector_field.update_device(); - - final_node.scalar_field.communicate(); - final_node.vector_field.communicate(); - MPI_Barrier(MPI_COMM_WORLD); - - - // Update scalar field to visualize the communication - - for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { - double value = 0.0; - for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); - } - value /= final_mesh.num_nodes_in_elem; - - for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; - } - } + // for (int i = 0; i < final_mesh.num_owned_nodes; i++) { + // final_node.scalar_field.host(i) = static_cast(rank); + // final_node.vector_field.host(i, 0) = static_cast(rank); + // 
final_node.vector_field.host(i, 1) = static_cast(rank); + // final_node.vector_field.host(i, 2) = static_cast(rank); + // } + // for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { + // final_node.scalar_field.host(i) = -1.0; + // final_node.vector_field.host(i, 0) = -1.0; + // final_node.vector_field.host(i, 1) = -1.0; + // final_node.vector_field.host(i, 2) = -1.0; + // } + + // final_node.coords.update_device(); + // final_node.scalar_field.update_device(); + // final_node.vector_field.update_device(); + + // final_node.scalar_field.communicate(); + // // final_node.vector_field.communicate(); + // MPI_Barrier(MPI_COMM_WORLD); + + + // // Update scalar field to visualize the communication + + // for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + // double value = 0.0; + // for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + // value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); + // } + // value /= final_mesh.num_nodes_in_elem; + + // for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + // final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; + // } + // } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 2113e9d6..542628bd 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -33,8 +33,8 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; - double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {100, 100, 100}; + double length[3] = {1.0, 0.5, 0.5}; + int num_elems_dim[3] = {2, 1, 1}; // Initial mesh built on rank zero Mesh_t initial_mesh; @@ -58,6 +58,9 @@ int main(int argc, char** argv) { std::cout<<"Initializing mesh"< Date: Wed, 19 Nov 2025 16:57:02 -0600 Subject: [PATCH 34/52] BUG: Debugging nodal send/recv WIP, working for 2x1 mesh --- examples/mesh_decomp/decomp_utils.h | 196 ++++++++++++++++++++++++---- 1 file changed, 170 
insertions(+), 26 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index ff697abe..3acf006d 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -792,9 +792,23 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); + // Print out the ghost elements for each rank sequentially + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "Rank " << rank << " has the following ghost elements: "; + for (const auto& elem_gid : ghost_elem_gids) { + std::cout << elem_gid << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + std::map> ghost_nodes_from_ranks; - std::set shared_nodes; // nodes on MPI rank boundaries + // Iterate through connectivity data from each rank (except ourselves) for (int r = 0; r < world_size; r++) { @@ -825,6 +839,63 @@ void build_ghost( } } + std::set shared_nodes; // nodes on MPI rank boundaries + // Iterate through connectivity data from each rank (except ourselves) to find shared nodes + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this element belongs to one of our ghost elements + if (ghost_elem_gids.find(elem_gid) != ghost_elem_gids.end()) { + // If another rank references a node that is also owned by us, it is a shared node + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + shared_nodes.insert(node_gid); + + } + } + } + } + + + + MPI_Barrier(MPI_COMM_WORLD); + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "Rank " << rank << " has the following shared nodes: "; + for (const auto& node_gid : shared_nodes) { + std::cout << node_gid << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + // Print out the ghost nodes for each rank sequentially + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "Rank " << rank << " has the following ghost nodes: "; + for (const auto& node_gid : ghost_node_gids) { + std::cout << node_gid << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + // WARNING: HERE IS THE BUG: // When we create the send pattern for ghost nodes, we are not filtering out nodes that are on MPI rank boundaries @@ -857,7 +928,7 @@ void build_ghost( // Map: local_node_gid -> set of remote ranks that ghost this node std::map> local_node_gid_to_ghosting_ranks; - std::vector> ghosted_nodes_on_ranks(world_size); + std::vector> shared_nodes_on_ranks(world_size); // Iterate through connectivity from all ranks except ourselves for (int r = 0; r < world_size; r++) { @@ -872,11 +943,13 @@ void build_ghost( // If this node is owned by us, and remote rank references 
it, they are ghosting it if (local_node_gids.find(node_gid) != local_node_gids.end()) { local_node_gid_to_ghosting_ranks[node_gid].insert(r); - ghosted_nodes_on_ranks[r].insert(node_gid); + shared_nodes_on_ranks[r].insert(node_gid); } } } + // WARNING: THE PREVIOUS STEP MUST INCLUDE ALL NODES AFTER MOVING GHOST NODES ONTO THIS RANK, and must be filtered to not include shared ndoes + // Use the map to create a vector of the ranks that this rank will receive data from for ghost nodes std::set ghost_node_send_ranks; for (const auto& pair : local_node_gid_to_ghosting_ranks) { @@ -884,27 +957,14 @@ void build_ghost( } std::vector ghost_node_send_ranks_vec(ghost_node_send_ranks.begin(), ghost_node_send_ranks.end()); - std::map> nodes_to_send_by_rank; // rank -> list of local node indices - for (int r = 0; r < world_size; r++) { - if (r == rank) continue; - for (size_t node_gid : ghosted_nodes_on_ranks[r]) { - int local_node_id = global_to_local_node_mapping[node_gid]; - nodes_to_send_by_rank[r].push_back(local_node_id); - } - } - - //print out the nodes to send by rank for each rank sequentially - for (int r = 0; r < world_size; r++) { - if (rank == r) { - std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; - for (const auto& rank_node_pair : nodes_to_send_by_rank) { - std::cout << rank_node_pair.first << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - + // std::map> nodes_to_send_by_rank; // rank -> list of local node indices + // for (int r = 0; r < world_size; r++) { + // if (r == rank) continue; + // for (size_t node_gid : shared_nodes_on_ranks[r]) { + // int local_node_id = global_to_local_node_mapping[node_gid]; + // nodes_to_send_by_rank[r].push_back(local_node_id); + // } + // } // Store the count of ghost elements for later use input_mesh.num_ghost_elems = ghost_elem_gids.size(); @@ -996,6 +1056,8 @@ void build_ghost( int total_extended_nodes = extended_node_lid; + MPI_Barrier(MPI_COMM_WORLD); + 
// Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) // Build request list: for each ghost node, find an owning rank via any ghost element that contains it std::map> rank_to_ghost_node_requests; @@ -1163,6 +1225,15 @@ void build_ghost( output_mesh.nodes_in_elem.update_device(); output_mesh.build_connectivity(); + + + + + + + + + MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout << " Finished building final mesh structure with ghost nodes and elements" << std::endl; @@ -1397,6 +1468,77 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); + + std::map> node_set_to_send_by_rank; + + // For each owned element that will be ghosted on other ranks, + // collect the nodes that need to be sent to those ranks + // boundary_elem_targets[elem_lid] contains pairs (rank, elem_gid) for ranks that ghost this element + for (int elem_lid = 0; elem_lid < input_mesh.num_elems; elem_lid++) { + // Get ranks that will ghost this element + for (const auto& pair : boundary_elem_targets[elem_lid]) { + int ghosting_rank = pair.first; + + // For each node in this element + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(elem_lid, j); + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); + + // Only send nodes that are NOT shared (not on MPI rank boundary) + // Shared nodes are already known to both ranks + if (shared_nodes.find(node_gid) == shared_nodes.end()) { + node_set_to_send_by_rank[ghosting_rank].insert(node_gid); + } + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + // Print out node_set_to_send_by_rank for each rank sequentially + MPI_Barrier(MPI_COMM_WORLD); + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "Rank " << r << " node_set_to_send_by_rank:" << std::endl; + for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { + std::cout << " To rank " << dest_rank << ": ["; + for (size_t node_gid : node_gids) { + 
std::cout << node_gid << " "; + } + std::cout << "]" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + } + + std::map> nodes_to_send_by_rank; // rank -> list of global node indices + + // Copy the node_set_to_send_by_rank map to nodes_to_send_by_rank + for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { + for (size_t node_gid : node_gids) { + nodes_to_send_by_rank[dest_rank].push_back(node_gid); + } + } + + // Print out nodes_to_send_by_rank for each rank sequentially + MPI_Barrier(MPI_COMM_WORLD); + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "Rank " << r << " nodes_to_send_by_rank:" << std::endl; + for (const auto& [dest_rank, node_gids] : nodes_to_send_by_rank) { + std::cout << " To rank " << dest_rank << ": ["; + for (size_t node_gid : node_gids) { + std::cout << node_gid << " "; + } + std::cout << "]" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + } + + // Initialize graph comms for elements // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator // that efficiently represents the communication pattern between ranks. 
@@ -1612,13 +1754,14 @@ void build_ghost( // print the nodes to send by rank rr for each rank sequentially for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); if (rank == r) { std::cout << "Rank " << rank << " will send nodes to the following ranks (nodes_to_send_by_rank_rr):" << std::endl; for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); std::cout << " To rank " << dest_rank << ": ["; for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); + int global_node_id = nodes_to_send_by_rank[dest_rank][j]; std::cout << global_node_id << " "; } std::cout << "]" << std::endl; @@ -1627,7 +1770,8 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); } - + MPI_Barrier(MPI_COMM_WORLD); + // print the nodes to send by rank rr for each rank sequentially for (int r = 0; r < world_size; r++) { if (rank == r) { From f7e350db303c84eea536cdb5f705fb0f8b600e79 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 19 Nov 2025 17:17:07 -0600 Subject: [PATCH 35/52] ENH: Node send ids are working, now to fix recv --- examples/mesh_decomp/decomp_utils.h | 7 ++++--- examples/mesh_decomp/mesh_decomp.cpp | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 3acf006d..1528bf60 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -1486,7 +1486,7 @@ void build_ghost( // Only send nodes that are NOT shared (not on MPI rank boundary) // Shared nodes are already known to both ranks - if (shared_nodes.find(node_gid) == shared_nodes.end()) { + if (shared_nodes_on_ranks[ghosting_rank].find(node_gid) == shared_nodes_on_ranks[ghosting_rank].end()) { // WARNING: THIS SHOULD BE MOFIFIED TO ONLY FILTER SHARED NODES WITH THIS SPECIFIC RANK 
node_set_to_send_by_rank[ghosting_rank].insert(node_gid); } } @@ -1716,7 +1716,8 @@ void build_ghost( for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - nodes_to_send_by_rank_rr.host(i, j) = nodes_to_send_by_rank[dest_rank][j]; + int node_gid = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); + nodes_to_send_by_rank_rr.host(i, j) = node_gid; } } nodes_to_send_by_rank_rr.update_device(); @@ -1761,7 +1762,7 @@ void build_ghost( int dest_rank = node_communication_plan.send_rank_ids.host(i); std::cout << " To rank " << dest_rank << ": ["; for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - int global_node_id = nodes_to_send_by_rank[dest_rank][j]; + int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); std::cout << global_node_id << " "; } std::cout << "]" << std::endl; diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 542628bd..b2f9f691 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -33,8 +33,8 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; - double length[3] = {1.0, 0.5, 0.5}; - int num_elems_dim[3] = {2, 1, 1}; + double length[3] = {1.0, 1.0, 0.5}; + int num_elems_dim[3] = {2, 2, 1}; // Initial mesh built on rank zero Mesh_t initial_mesh; From 72e0f5b18306a2e8d8e3b61c0c5398d531ebce72 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 19 Nov 2025 17:36:16 -0600 Subject: [PATCH 36/52] ENH: send/recv match, need to be swapped to local ids --- examples/mesh_decomp/decomp_utils.h | 61 ++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 1528bf60..717b31fe 100644 --- 
a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -1724,14 +1724,54 @@ void build_ghost( - // Count how many ghost nodes come from each source rank - std::map> nodes_to_recv_by_rank; // rank -> list of ghost node indices - int ghost_node_index = 0; - for (size_t ghost_node_gid : ghost_node_gids) { - int source_rank = ghost_node_recv_rank[ghost_node_gid]; - int ghost_node_local_id = output_mesh.num_owned_nodes + ghost_node_index; - nodes_to_recv_by_rank[source_rank].push_back(ghost_node_local_id); - ghost_node_index++; + // For each ghost element, determine which nodes need to be received from the owning rank + // Build the receive list based on ghost element nodes, not on ghost_node_gids + // This ensures we receive all nodes needed by ghost elements + std::map> node_set_to_recv_by_rank; // rank -> set of node GIDs to receive + + for (int i = 0; i < output_mesh.num_ghost_elems; i++) { + int ghost_elem_lid = output_mesh.num_owned_elems + i; + size_t ghost_elem_gid = output_mesh.local_to_global_elem_mapping.host(ghost_elem_lid); + int owning_rank = elem_gid_to_rank.at(ghost_elem_gid); + + // Collect all nodes in this ghost element + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = output_mesh.nodes_in_elem.host(ghost_elem_lid, j); + size_t node_gid = output_mesh.local_to_global_node_mapping.host(node_lid); + + // Only receive nodes that: + // 1. We don't own (not in local_node_gids) + // 2. 
Are NOT shared (not on MPI rank boundary) + // Shared nodes are already known to both ranks via element connectivity + if (local_node_gids.find(node_gid) == local_node_gids.end() && + shared_nodes_on_ranks[owning_rank].find(node_gid) == shared_nodes_on_ranks[owning_rank].end()) { + node_set_to_recv_by_rank[owning_rank].insert(node_gid); + } + } + } + + // Convert node GIDs to local indices and build nodes_to_recv_by_rank + std::map> nodes_to_recv_by_rank; // rank -> list of ghost node local indices + std::map node_gid_to_ghost_lid; // map ghost node GID to its local index in output_mesh + + // Build the GID->local index mapping for ALL ghost nodes in output_mesh + // Ghost nodes are those with local IDs >= num_owned_nodes + for (int i = output_mesh.num_owned_nodes; i < output_mesh.num_nodes; i++) { + size_t node_gid = output_mesh.local_to_global_node_mapping.host(i); + node_gid_to_ghost_lid[node_gid] = i; + } + + // Now convert the GID sets to local index vectors + for (const auto& pair : node_set_to_recv_by_rank) { + int source_rank = pair.first; + const std::set& node_gids = pair.second; + + for (size_t node_gid : node_gids) { + auto it = node_gid_to_ghost_lid.find(node_gid); + if (it != node_gid_to_ghost_lid.end()) { + nodes_to_recv_by_rank[source_rank].push_back(it->second); + } + } } // Serialize into a DRaggedRightArrayKokkos @@ -1762,7 +1802,7 @@ void build_ghost( int dest_rank = node_communication_plan.send_rank_ids.host(i); std::cout << " To rank " << dest_rank << ": ["; for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); + int global_node_id = nodes_to_send_by_rank[dest_rank][j]; std::cout << global_node_id << " "; } std::cout << "]" << std::endl; @@ -1781,7 +1821,8 @@ void build_ghost( int source_rank = node_communication_plan.recv_rank_ids.host(i); std::cout << " From rank " << source_rank << ": ["; for (int j = 0; j < 
nodes_to_recv_by_rank[source_rank].size(); j++) { - int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_recv_by_rank[source_rank][j]); + int node_lid = nodes_to_recv_by_rank[source_rank][j]; + size_t global_node_id = output_mesh.local_to_global_node_mapping.host(node_lid); std::cout << global_node_id << " "; } std::cout << "]" << std::endl; From 6332734a9650aee45ddd9503cd09782b8e9880da Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Thu, 20 Nov 2025 13:35:52 -0600 Subject: [PATCH 37/52] ENH: Nodal comms working --- examples/mesh_decomp/decomp_utils.h | 459 ++++++++++++++++----------- examples/mesh_decomp/mesh_decomp.cpp | 4 +- examples/mesh_decomp/mesh_io.h | 14 +- src/include/mpi_types.h | 4 + 4 files changed, 281 insertions(+), 200 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 717b31fe..0e7cedff 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -105,6 +105,10 @@ void naive_partition_mesh( // MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, // void *recvbuf, int recvcount, MPI_Datatype recvtype, // int root, MPI_Comm comm) + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting the scatter operation for the element counts per rank"< required_node_state = { node_state::coords }; + + naive_node.initialize(num_nodes_on_rank, 3, required_node_state); for(int i = 0; i < num_nodes_on_rank; i++) { @@ -343,6 +360,10 @@ void naive_partition_mesh( naive_node.coords.update_device(); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" After initializing the node state variables"< nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); double t_scatter_elemnode_start = MPI_Wtime(); - - if (rank == 0) { - // Prepare element-node connectivity data for each rank - std::vector all_nodes_in_elem; - std::vector sendcounts(world_size); - std::vector displs(world_size); + MPI_Barrier(MPI_COMM_WORLD); + 
// if (rank == 0) { + // // Prepare element-node connectivity data for each rank + // std::vector all_nodes_in_elem; + // std::vector sendcounts(world_size); + // std::vector displs(world_size); - int displacement = 0; - for(int i = 0; i < world_size; i++) { - int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element - sendcounts[i] = num_connectivity_entries; - displs[i] = displacement; + // int displacement = 0; + // for(int i = 0; i < world_size; i++) { + // int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element + // sendcounts[i] = num_connectivity_entries; + // displs[i] = displacement; - // Copy element-node connectivity for rank i - for(int j = 0; j < elements_to_send[i].size(); j++) { - for(int k = 0; k < num_nodes_per_elem; k++) { - all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); - } - } - displacement += num_connectivity_entries; - } - // Send the connectivity data to each rank - MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - 0, MPI_COMM_WORLD); + // // Copy element-node connectivity for rank i + // for(int j = 0; j < elements_to_send[i].size(); j++) { + // for(int k = 0; k < num_nodes_per_elem; k++) { + // all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + // } + // } + // displacement += num_connectivity_entries; + // } + // // Send the connectivity data to each rank + // MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + // nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, + // 0, MPI_COMM_WORLD); + // } + // else { + // MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + // nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, + // 0, 
MPI_COMM_WORLD); + // } + + // MPI_Barrier(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" before scattering the element-node connectivity data to each rank"< conn_sendcounts(world_size); + std::vector conn_displs(world_size); + int conn_displacement = 0; + for (int i = 0; i < world_size; i++) { + conn_sendcounts[i] = elems_per_rank[i] * num_nodes_per_elem; + conn_displs[i] = conn_displacement; + conn_displacement += conn_sendcounts[i]; } - else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - 0, MPI_COMM_WORLD); + + // Scatter using the native storage type (size_t) and then convert locally to int + size_t* global_nodes_in_elem = (rank == 0) + ? initial_mesh.nodes_in_elem.host_pointer() + : nullptr; + + std::vector nodes_in_elem_on_rank_size_t(num_elements_on_rank * num_nodes_per_elem); + + MPI_Scatterv(global_nodes_in_elem, conn_sendcounts.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, + nodes_in_elem_on_rank_size_t.data(), nodes_in_elem_on_rank_size_t.size(), MPI_UNSIGNED_LONG_LONG, + 0, MPI_COMM_WORLD); + + for (size_t idx = 0; idx < nodes_in_elem_on_rank_size_t.size(); ++idx) { + nodes_in_elem_on_rank[idx] = static_cast(nodes_in_elem_on_rank_size_t[idx]); } MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" After scattering the element-node connectivity data to each rank"< all_elems_in_elem; @@ -479,6 +537,9 @@ void naive_partition_mesh( MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" After scattering the element-element connectivity data to each rank"<> ghost_nodes_from_ranks; @@ -869,32 +930,32 @@ void build_ghost( - MPI_Barrier(MPI_COMM_WORLD); - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "Rank " << rank << " has the following shared nodes: "; - for (const auto& node_gid : shared_nodes) { - 
std::cout << node_gid << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } + // MPI_Barrier(MPI_COMM_WORLD); + // for (int r = 0; r < world_size; r++) { + // MPI_Barrier(MPI_COMM_WORLD); + // if (rank == r) { + // std::cout << "Rank " << rank << " has the following shared nodes: "; + // for (const auto& node_gid : shared_nodes) { + // std::cout << node_gid << " "; + // } + // std::cout << std::endl; + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } - MPI_Barrier(MPI_COMM_WORLD); - // Print out the ghost nodes for each rank sequentially - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "Rank " << rank << " has the following ghost nodes: "; - for (const auto& node_gid : ghost_node_gids) { - std::cout << node_gid << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } + // MPI_Barrier(MPI_COMM_WORLD); + // // Print out the ghost nodes for each rank sequentially + // for (int r = 0; r < world_size; r++) { + // MPI_Barrier(MPI_COMM_WORLD); + // if (rank == r) { + // std::cout << "Rank " << rank << " has the following ghost nodes: "; + // for (const auto& node_gid : ghost_node_gids) { + // std::cout << node_gid << " "; + // } + // std::cout << std::endl; + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } // WARNING: HERE IS THE BUG: // When we create the send pattern for ghost nodes, we are not filtering out nodes that are on MPI rank boundaries @@ -909,17 +970,17 @@ void build_ghost( // Print out the ghost node receive ranks for each rank sequentially - for (int r = 0; r < world_size; r++) { - if (rank == r) { - MPI_Barrier(MPI_COMM_WORLD); - std::cout << "Rank " << rank << " will receive data from the following ranks for ghost nodes: "; - for (int r : ghost_node_receive_ranks_vec) { - std::cout << r << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } + // for (int r = 0; r < world_size; r++) { + // if (rank == r) { + // MPI_Barrier(MPI_COMM_WORLD); + 
// std::cout << "Rank " << rank << " will receive data from the following ranks for ghost nodes: "; + // for (int r : ghost_node_receive_ranks_vec) { + // std::cout << r << " "; + // } + // std::cout << std::endl; + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } // Find which nodes *we own* are ghosted on other ranks, and on which ranks @@ -1496,21 +1557,21 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); // Print out node_set_to_send_by_rank for each rank sequentially - MPI_Barrier(MPI_COMM_WORLD); - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "Rank " << r << " node_set_to_send_by_rank:" << std::endl; - for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { - std::cout << " To rank " << dest_rank << ": ["; - for (size_t node_gid : node_gids) { - std::cout << node_gid << " "; - } - std::cout << "]" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - } + // MPI_Barrier(MPI_COMM_WORLD); + // for (int r = 0; r < world_size; r++) { + // MPI_Barrier(MPI_COMM_WORLD); + // if (rank == r) { + // std::cout << "Rank " << r << " node_set_to_send_by_rank:" << std::endl; + // for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { + // std::cout << " To rank " << dest_rank << ": ["; + // for (size_t node_gid : node_gids) { + // std::cout << node_gid << " "; + // } + // std::cout << "]" << std::endl; + // } + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } std::map> nodes_to_send_by_rank; // rank -> list of global node indices @@ -1521,25 +1582,25 @@ void build_ghost( } } - // Print out nodes_to_send_by_rank for each rank sequentially - MPI_Barrier(MPI_COMM_WORLD); - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "Rank " << r << " nodes_to_send_by_rank:" << std::endl; - for (const auto& [dest_rank, node_gids] : nodes_to_send_by_rank) { - std::cout << " To rank " << dest_rank << ": ["; - for (size_t node_gid : node_gids) { - std::cout 
<< node_gid << " "; - } - std::cout << "]" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - } + // // Print out nodes_to_send_by_rank for each rank sequentially + // MPI_Barrier(MPI_COMM_WORLD); + // for (int r = 0; r < world_size; r++) { + // MPI_Barrier(MPI_COMM_WORLD); + // if (rank == r) { + // std::cout << "Rank " << r << " nodes_to_send_by_rank:" << std::endl; + // for (const auto& [dest_rank, node_gids] : nodes_to_send_by_rank) { + // std::cout << " To rank " << dest_rank << ": ["; + // for (size_t node_gid : node_gids) { + // std::cout << node_gid << " "; + // } + // std::cout << "]" << std::endl; + // } + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } -// Initialize graph comms for elements + // Initialize graph comms for elements // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator // that efficiently represents the communication pattern between ranks. // This allows MPI to optimize communication based on the actual connectivity pattern. @@ -1579,7 +1640,7 @@ void build_ghost( // if(print_info) element_communication_plan.verify_graph_communicator(); -// Initialize graph comms for nodes + // Initialize graph comms for nodes // ---------- Prepare INCOMING edges (sources) ---------- // indegree: Number of ranks from which this rank will RECEIVE data // These are the ranks that own nodes which are ghosted on this rank @@ -1603,15 +1664,15 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); // Optional: Verify the graph communicator was created successfully - print_info = true; - if(print_info) node_communication_plan.verify_graph_communicator(); - print_info = false; + // print_info = true; + // if(print_info) node_communication_plan.verify_graph_communicator(); + // print_info = false; -// ****************************************************************************************** -// Build send counts and displacements for element communication -// 
****************************************************************************************** + // ****************************************************************************************** + // Build send counts and displacements for element communication + // ****************************************************************************************** - // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== + // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== // For MPI_Neighbor_alltoallv with graph communicator: // - elem_sendcounts[i] = number of elements to send to i-th outgoing neighbor (destinations_out[i]) // - elem_sdispls[i] = starting position in send buffer for i-th outgoing neighbor @@ -1692,16 +1753,16 @@ void build_ghost( // Print out the nodes to send by rank for each rank sequentially - for (int r = 0; r < world_size; r++) { - if (rank == r) { - std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; - for (const auto& rank_node_pair : nodes_to_send_by_rank) { - std::cout << rank_node_pair.first << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } + // for (int r = 0; r < world_size; r++) { + // if (rank == r) { + // std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; + // for (const auto& rank_node_pair : nodes_to_send_by_rank) { + // std::cout << rank_node_pair.first << " "; + // } + // std::cout << std::endl; + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } // Serialize into a DRaggedRightArrayKokkos @@ -1716,8 +1777,9 @@ void build_ghost( for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - int node_gid = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); - 
nodes_to_send_by_rank_rr.host(i, j) = node_gid; + int node_gid = nodes_to_send_by_rank[dest_rank][j]; + int node_lid = node_gid_to_extended_lid[node_gid]; + nodes_to_send_by_rank_rr.host(i, j) = node_lid; } } nodes_to_send_by_rank_rr.update_device(); @@ -1785,7 +1847,9 @@ void build_ghost( for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { int source_rank = node_communication_plan.recv_rank_ids.host(i); for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { - size_t local_id = nodes_to_recv_by_rank[source_rank][j]; + size_t node_gid = nodes_to_recv_by_rank[source_rank][j]; + size_t local_id = node_gid_to_extended_lid[node_gid]; + nodes_to_recv_by_rank_rr.host(i, j) = nodes_to_recv_by_rank[source_rank][j]; } } @@ -1793,43 +1857,43 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - // print the nodes to send by rank rr for each rank sequentially - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "Rank " << rank << " will send nodes to the following ranks (nodes_to_send_by_rank_rr):" << std::endl; - for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { - int dest_rank = node_communication_plan.send_rank_ids.host(i); - std::cout << " To rank " << dest_rank << ": ["; - for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - int global_node_id = nodes_to_send_by_rank[dest_rank][j]; - std::cout << global_node_id << " "; - } - std::cout << "]" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - } + // // print the nodes to send by rank rr for each rank sequentially + // for (int r = 0; r < world_size; r++) { + // MPI_Barrier(MPI_COMM_WORLD); + // if (rank == r) { + // std::cout << "Rank " << rank << " will send nodes to the following ranks (nodes_to_send_by_rank_rr):" << std::endl; + // for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + // int dest_rank = node_communication_plan.send_rank_ids.host(i); + // std::cout << " To rank " << 
dest_rank << ": ["; + // for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { + // int global_node_id = nodes_to_send_by_rank[dest_rank][j]; + // std::cout << global_node_id << " "; + // } + // std::cout << "]" << std::endl; + // } + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } - MPI_Barrier(MPI_COMM_WORLD); + // MPI_Barrier(MPI_COMM_WORLD); - // print the nodes to send by rank rr for each rank sequentially - for (int r = 0; r < world_size; r++) { - if (rank == r) { - std::cout << "Rank " << rank << " will receive nodes from the following ranks (nodes_to_recv_by_rank_rr):" << std::endl; - for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { - int source_rank = node_communication_plan.recv_rank_ids.host(i); - std::cout << " From rank " << source_rank << ": ["; - for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { - int node_lid = nodes_to_recv_by_rank[source_rank][j]; - size_t global_node_id = output_mesh.local_to_global_node_mapping.host(node_lid); - std::cout << global_node_id << " "; - } - std::cout << "]" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - } + // // print the nodes to send by rank rr for each rank sequentially + // for (int r = 0; r < world_size; r++) { + // if (rank == r) { + // std::cout << "Rank " << rank << " will receive nodes from the following ranks (nodes_to_recv_by_rank_rr):" << std::endl; + // for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + // int source_rank = node_communication_plan.recv_rank_ids.host(i); + // std::cout << " From rank " << source_rank << ": ["; + // for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { + // int node_lid = nodes_to_recv_by_rank[source_rank][j]; + // size_t global_node_id = output_mesh.local_to_global_node_mapping.host(node_lid); + // std::cout << global_node_id << " "; + // } + // std::cout << "]" << std::endl; + // } + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } @@ -1837,7 +1901,7 @@ void build_ghost( 
node_communication_plan.setup_send_recv(nodes_to_send_by_rank_rr, nodes_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); - node_communication_plan.verify_send_recv(); + // node_communication_plan.verify_send_recv(); } @@ -1897,8 +1961,12 @@ void partition_mesh( // Perform the naive partitioning of the mesh - naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "Performing the naive partitioning of the mesh" << std::endl; + naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "Naive partitioning of the mesh completed" << std::endl; /********************************************************************************** * Build PT-Scotch distributed graph representation of the mesh for repartitioning * @@ -2481,9 +2549,12 @@ void partition_mesh( CommunicationPlan node_communication_plan; node_communication_plan.initialize(MPI_COMM_WORLD); - build_ghost(intermediate_mesh, final_mesh, intermediate_node, final_node, element_communication_plan, node_communication_plan, world_size, rank); MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting the ghost element and node construction"< node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); - // for (int i = 0; i < final_mesh.num_owned_nodes; i++) { - // final_node.scalar_field.host(i) = static_cast(rank); - // final_node.vector_field.host(i, 0) = static_cast(rank); - // final_node.vector_field.host(i, 1) = static_cast(rank); - // final_node.vector_field.host(i, 2) = static_cast(rank); - // } - // for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { - // 
final_node.scalar_field.host(i) = -1.0; - // final_node.vector_field.host(i, 0) = -1.0; - // final_node.vector_field.host(i, 1) = -1.0; - // final_node.vector_field.host(i, 2) = -1.0; - // } + for (int i = 0; i < final_mesh.num_owned_nodes; i++) { + final_node.scalar_field.host(i) = static_cast(rank); + final_node.vector_field.host(i, 0) = static_cast(rank); + final_node.vector_field.host(i, 1) = static_cast(rank); + final_node.vector_field.host(i, 2) = static_cast(rank); + } + for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { + final_node.scalar_field.host(i) = -1.0; + final_node.vector_field.host(i, 0) = -1.0; + final_node.vector_field.host(i, 1) = -1.0; + final_node.vector_field.host(i, 2) = -1.0; + } - // final_node.coords.update_device(); - // final_node.scalar_field.update_device(); - // final_node.vector_field.update_device(); + final_node.coords.update_device(); + final_node.scalar_field.update_device(); + final_node.vector_field.update_device(); - // final_node.scalar_field.communicate(); - // // final_node.vector_field.communicate(); - // MPI_Barrier(MPI_COMM_WORLD); + final_node.scalar_field.communicate(); + // final_node.vector_field.communicate(); + MPI_Barrier(MPI_COMM_WORLD); - // // Update scalar field to visualize the communication + // Update scalar field to visualize the communication - // for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { - // double value = 0.0; - // for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - // value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); - // } - // value /= final_mesh.num_nodes_in_elem; + for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + double value = 0.0; + for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); + } + value /= final_mesh.num_nodes_in_elem; - // for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - // 
final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; - // } - // } + for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; + } + } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index b2f9f691..5738b21f 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -33,8 +33,8 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; - double length[3] = {1.0, 1.0, 0.5}; - int num_elems_dim[3] = {2, 2, 1}; + double length[3] = {1.0, 1.0, 1.0}; + int num_elems_dim[3] = {100, 100, 100}; // Initial mesh built on rank zero Mesh_t initial_mesh; diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index c9a75a0f..00f79fb2 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -287,8 +287,8 @@ void build_3d_box( const int num_cell_vec_vars = 0; const int num_cell_tensor_vars = 0; - const int num_point_scalar_vars = 2; - const int num_point_vec_vars = 1; + const int num_point_scalar_vars = 3; + const int num_point_vec_vars = 2; // Scalar values associated with a cell @@ -301,11 +301,11 @@ void build_3d_box( // }; const char point_scalar_var_names[num_point_scalar_vars][15] = { - "rank_id", "elems_in_node" + "rank_id", "elems_in_node", "scalar_field" }; const char point_vec_var_names[num_point_vec_vars][15] = { - "pos" + "pos", "vector_field" }; // short hand @@ -341,8 +341,14 @@ void build_3d_box( vec_fields(node_gid, 0, 1) = node.coords.host(node_gid, 1); vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + // vector field, var 1 + vec_fields(node_gid, 1, 0) = node.vector_field.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = node.vector_field.host(node_gid, 1); + vec_fields(node_gid, 1, 2) = node.vector_field.host(node_gid, 2); + point_scalar_fields(node_gid, 0) = rank; point_scalar_fields(node_gid, 1) = 
(double)mesh.num_corners_in_node(node_gid); + point_scalar_fields(node_gid, 2) = node.scalar_field.host(node_gid); if(node_gid == 0) { std::cout << "*******[rank " << rank << "] - num_corners_in_node: " << mesh.num_corners_in_node(node_gid) << std::endl; diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index 10e58121..b0999049 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -175,6 +175,10 @@ class MPICArrayKokkos { // Method to set comm plan for halo communication void initialize_comm_plan(CommunicationPlan& comm_plan){ comm_plan_ = &comm_plan; + + if(comm_plan_->comm_type == communication_plan_type::no_communication){ + return; + } size_t send_size = comm_plan_->total_send_count * stride_; size_t recv_size = comm_plan_->total_recv_count * stride_; From 524e95005659021329493953baf0ce23a458afe4 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 21 Nov 2025 11:09:59 -0600 Subject: [PATCH 38/52] STYLE: Tidying up --- examples/mesh_decomp/decomp_utils.h | 298 +--------------------------- 1 file changed, 11 insertions(+), 287 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 0e7cedff..ac1fd6e9 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -240,9 +240,6 @@ void naive_partition_mesh( // ****************************************************************************************** // Scatter the actual node global ids to each rank // ****************************************************************************************** - // Timer: Start measuring time for scattering node global ids - double t_scatter_nodeids_start = MPI_Wtime(); - if (rank == 0) { // Prepare data for MPI_Scatterv (scatter with variable counts) @@ -283,18 +280,12 @@ void naive_partition_mesh( MPI_Barrier(MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" After scattering the node global ids to each rank"< 
node_pos_on_rank_flat(num_nodes_on_rank * 3); - // Timer for scattering node positions - double t_scatter_nodepos_start = MPI_Wtime(); - if(rank == 0) { for (int i = 0; i < world_size; i++) { @@ -338,9 +329,6 @@ void naive_partition_mesh( MPI_Barrier(MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" After scattering the node positions to each rank"< required_node_state = { node_state::coords }; - naive_node.initialize(num_nodes_on_rank, 3, required_node_state); for(int i = 0; i < num_nodes_on_rank; i++) { @@ -359,55 +346,16 @@ void naive_partition_mesh( naive_node.coords.update_device(); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" After initializing the node state variables"< nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); - - double t_scatter_elemnode_start = MPI_Wtime(); - MPI_Barrier(MPI_COMM_WORLD); - // if (rank == 0) { - // // Prepare element-node connectivity data for each rank - // std::vector all_nodes_in_elem; - // std::vector sendcounts(world_size); - // std::vector displs(world_size); - - // int displacement = 0; - // for(int i = 0; i < world_size; i++) { - // int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element - // sendcounts[i] = num_connectivity_entries; - // displs[i] = displacement; - - // // Copy element-node connectivity for rank i - // for(int j = 0; j < elements_to_send[i].size(); j++) { - // for(int k = 0; k < num_nodes_per_elem; k++) { - // all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); - // } - // } - // displacement += num_connectivity_entries; - // } - // // Send the connectivity data to each rank - // MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - // nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - // 0, MPI_COMM_WORLD); - // } - // else { - // MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - // 
nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - // 0, MPI_COMM_WORLD); - // } - - // MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" before scattering the element-node connectivity data to each rank"< conn_sendcounts(world_size); @@ -420,9 +368,11 @@ void naive_partition_mesh( } // Scatter using the native storage type (size_t) and then convert locally to int - size_t* global_nodes_in_elem = (rank == 0) - ? initial_mesh.nodes_in_elem.host_pointer() - : nullptr; + size_t* global_nodes_in_elem = nullptr; + if (rank == 0) { + global_nodes_in_elem = initial_mesh.nodes_in_elem.host_pointer(); + } + MPI_Barrier(MPI_COMM_WORLD); std::vector nodes_in_elem_on_rank_size_t(num_elements_on_rank * num_nodes_per_elem); @@ -434,12 +384,6 @@ void naive_partition_mesh( nodes_in_elem_on_rank[idx] = static_cast(nodes_in_elem_on_rank_size_t[idx]); } - MPI_Barrier(MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" After scattering the element-node connectivity data to each rank"<> ghost_nodes_from_ranks; - - // Iterate through connectivity data from each rank (except ourselves) for (int r = 0; r < world_size; r++) { if (r == rank) continue; // Skip our own data - we already know our elements @@ -928,38 +845,6 @@ void build_ghost( } } - - - // MPI_Barrier(MPI_COMM_WORLD); - // for (int r = 0; r < world_size; r++) { - // MPI_Barrier(MPI_COMM_WORLD); - // if (rank == r) { - // std::cout << "Rank " << rank << " has the following shared nodes: "; - // for (const auto& node_gid : shared_nodes) { - // std::cout << node_gid << " "; - // } - // std::cout << std::endl; - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // MPI_Barrier(MPI_COMM_WORLD); - // // Print out the ghost nodes for each rank sequentially - // for (int r = 0; r < world_size; r++) { - // MPI_Barrier(MPI_COMM_WORLD); - // if (rank == r) { - // std::cout << "Rank " << rank << " has the following ghost nodes: "; - // for 
(const auto& node_gid : ghost_node_gids) { - // std::cout << node_gid << " "; - // } - // std::cout << std::endl; - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // WARNING: HERE IS THE BUG: - // When we create the send pattern for ghost nodes, we are not filtering out nodes that are on MPI rank boundaries - // Create a vecor of the ranks that this rank will receive data from for ghost nodes std::set ghost_node_receive_ranks; for (const auto& pair : ghost_node_recv_rank) { @@ -969,20 +854,6 @@ void build_ghost( std::vector ghost_node_receive_ranks_vec(ghost_node_receive_ranks.begin(), ghost_node_receive_ranks.end()); - // Print out the ghost node receive ranks for each rank sequentially - // for (int r = 0; r < world_size; r++) { - // if (rank == r) { - // MPI_Barrier(MPI_COMM_WORLD); - // std::cout << "Rank " << rank << " will receive data from the following ranks for ghost nodes: "; - // for (int r : ghost_node_receive_ranks_vec) { - // std::cout << r << " "; - // } - // std::cout << std::endl; - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // Find which nodes *we own* are ghosted on other ranks, and on which ranks // We want: for each of our local nodes, the list of ranks that ghost it @@ -1009,8 +880,6 @@ void build_ghost( } } - // WARNING: THE PREVIOUS STEP MUST INCLUDE ALL NODES AFTER MOVING GHOST NODES ONTO THIS RANK, and must be filtered to not include shared ndoes - // Use the map to create a vector of the ranks that this rank will receive data from for ghost nodes std::set ghost_node_send_ranks; for (const auto& pair : local_node_gid_to_ghosting_ranks) { @@ -1018,15 +887,6 @@ void build_ghost( } std::vector ghost_node_send_ranks_vec(ghost_node_send_ranks.begin(), ghost_node_send_ranks.end()); - // std::map> nodes_to_send_by_rank; // rank -> list of local node indices - // for (int r = 0; r < world_size; r++) { - // if (r == rank) continue; - // for (size_t node_gid : shared_nodes_on_ranks[r]) { - // int local_node_id = 
global_to_local_node_mapping[node_gid]; - // nodes_to_send_by_rank[r].push_back(local_node_id); - // } - // } - // Store the count of ghost elements for later use input_mesh.num_ghost_elems = ghost_elem_gids.size(); input_mesh.num_ghost_nodes = ghost_node_gids.size(); @@ -1109,7 +969,6 @@ void build_ghost( } } - // Assign extended local IDs to ghost-only nodes for (size_t node_gid : ghost_only_nodes) { node_gid_to_extended_lid[node_gid] = extended_node_lid++; @@ -1276,25 +1135,9 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - double t_ghost_end = MPI_Wtime(); - - if (rank == 0) { - std::cout << " Finished calculating ghost elements" << std::endl; - std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." << std::endl; - } - output_mesh.nodes_in_elem.update_device(); output_mesh.build_connectivity(); - - - - - - - - - MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout << " Finished building final mesh structure with ghost nodes and elements" << std::endl; @@ -1528,8 +1371,6 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - - std::map> node_set_to_send_by_rank; // For each owned element that will be ghosted on other ranks, @@ -1556,23 +1397,6 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - // Print out node_set_to_send_by_rank for each rank sequentially - // MPI_Barrier(MPI_COMM_WORLD); - // for (int r = 0; r < world_size; r++) { - // MPI_Barrier(MPI_COMM_WORLD); - // if (rank == r) { - // std::cout << "Rank " << r << " node_set_to_send_by_rank:" << std::endl; - // for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { - // std::cout << " To rank " << dest_rank << ": ["; - // for (size_t node_gid : node_gids) { - // std::cout << node_gid << " "; - // } - // std::cout << "]" << std::endl; - // } - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - std::map> nodes_to_send_by_rank; // rank -> list of global node indices // Copy the node_set_to_send_by_rank map to nodes_to_send_by_rank @@ -1582,24 +1406,6 @@ 
void build_ghost( } } - // // Print out nodes_to_send_by_rank for each rank sequentially - // MPI_Barrier(MPI_COMM_WORLD); - // for (int r = 0; r < world_size; r++) { - // MPI_Barrier(MPI_COMM_WORLD); - // if (rank == r) { - // std::cout << "Rank " << r << " nodes_to_send_by_rank:" << std::endl; - // for (const auto& [dest_rank, node_gids] : nodes_to_send_by_rank) { - // std::cout << " To rank " << dest_rank << ": ["; - // for (size_t node_gid : node_gids) { - // std::cout << node_gid << " "; - // } - // std::cout << "]" << std::endl; - // } - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // Initialize graph comms for elements // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator // that efficiently represents the communication pattern between ranks. @@ -1663,11 +1469,6 @@ void build_ghost( node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); MPI_Barrier(MPI_COMM_WORLD); - // Optional: Verify the graph communicator was created successfully - // print_info = true; - // if(print_info) node_communication_plan.verify_graph_communicator(); - // print_info = false; - // ****************************************************************************************** // Build send counts and displacements for element communication // ****************************************************************************************** @@ -1751,20 +1552,6 @@ void build_ghost( // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
// -------------------------------------------------------------------------------------- - - // Print out the nodes to send by rank for each rank sequentially - // for (int r = 0; r < world_size; r++) { - // if (rank == r) { - // std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; - // for (const auto& rank_node_pair : nodes_to_send_by_rank) { - // std::cout << rank_node_pair.first << " "; - // } - // std::cout << std::endl; - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // Serialize into a DRaggedRightArrayKokkos CArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks); for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { @@ -1784,8 +1571,6 @@ void build_ghost( } nodes_to_send_by_rank_rr.update_device(); - - // For each ghost element, determine which nodes need to be received from the owning rank // Build the receive list based on ghost element nodes, not on ghost_node_gids // This ensures we receive all nodes needed by ghost elements @@ -1857,47 +1642,6 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - // // print the nodes to send by rank rr for each rank sequentially - // for (int r = 0; r < world_size; r++) { - // MPI_Barrier(MPI_COMM_WORLD); - // if (rank == r) { - // std::cout << "Rank " << rank << " will send nodes to the following ranks (nodes_to_send_by_rank_rr):" << std::endl; - // for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { - // int dest_rank = node_communication_plan.send_rank_ids.host(i); - // std::cout << " To rank " << dest_rank << ": ["; - // for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - // int global_node_id = nodes_to_send_by_rank[dest_rank][j]; - // std::cout << global_node_id << " "; - // } - // std::cout << "]" << std::endl; - // } - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // MPI_Barrier(MPI_COMM_WORLD); - - // // print the nodes to send by rank rr for each rank sequentially - // for (int r = 0; r < 
world_size; r++) { - // if (rank == r) { - // std::cout << "Rank " << rank << " will receive nodes from the following ranks (nodes_to_recv_by_rank_rr):" << std::endl; - // for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { - // int source_rank = node_communication_plan.recv_rank_ids.host(i); - // std::cout << " From rank " << source_rank << ": ["; - // for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { - // int node_lid = nodes_to_recv_by_rank[source_rank][j]; - // size_t global_node_id = output_mesh.local_to_global_node_mapping.host(node_lid); - // std::cout << global_node_id << " "; - // } - // std::cout << "]" << std::endl; - // } - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - - - node_communication_plan.setup_send_recv(nodes_to_send_by_rank_rr, nodes_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); @@ -2207,9 +1951,6 @@ void partition_mesh( // Other topology options could be substituted above according to your needs (see docs). SCOTCH_archCmplt(&archdat, static_cast(world_size)); - - - // ===================== PT-Scotch Strategy Selection and Documentation ====================== // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. 
@@ -2348,10 +2089,6 @@ void partition_mesh( // New elements owned by this rank int num_new_elems = static_cast(new_elem_gids.size()); - if (print_info) { - std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; - } - // -------------- Phase 3: Send element–node connectivity -------------- int nodes_per_elem = naive_mesh.num_nodes_in_elem; @@ -2472,13 +2209,9 @@ void partition_mesh( intermediate_mesh.local_to_global_node_mapping.update_device(); intermediate_mesh.local_to_global_elem_mapping.update_device(); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< Date: Fri, 21 Nov 2025 13:07:03 -0600 Subject: [PATCH 39/52] ENH: Parallelize mesh builder, tidy --- examples/mesh_decomp/mesh_decomp.cpp | 4 - examples/mesh_decomp/mesh_io.h | 149 ++++++++++++++++++--------- 2 files changed, 101 insertions(+), 52 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 5738b21f..9663b306 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -58,10 +58,6 @@ int main(int argc, char** argv) { std::cout<<"Initializing mesh"< split(std::string s, std::string delimiter) /// \param Number of j indices /// ///////////////////////////////////////////////////////////////////////////// -inline int get_id(int i, int j, int k, int num_i, int num_j) +KOKKOS_INLINE_FUNCTION +size_t get_id(int i, int j, int k, int num_i, int num_j) { return i + j * num_i + k * num_i * num_j; -} +} // end get_id ///////////////////////////////////////////////////////////////////////////// /// @@ -189,20 +190,40 @@ void build_3d_box( // --- Build nodes --- + CArrayDual origin_mtr(3, "origin_mtr"); + origin_mtr(0) = origin[0]; + origin_mtr(1) = origin[1]; + origin_mtr(2) = origin[2]; + origin_mtr.update_device(); + + // populate the point data structures + FOR_ALL(k, 0, 
num_points_k, + j, 0, num_points_j, + i, 0, num_points_i,{ + + // global id for the point + size_t node_gid = get_id(i, j, k, num_points_i, num_points_j); + + // store the point coordinates + node.coords.host(node_gid, 0) = origin_mtr(0) + (double)i * dx; + node.coords.host(node_gid, 1) = origin_mtr(1) + (double)j * dy; + node.coords.host(node_gid, 2) = origin_mtr(2) + (double)k * dz; + }); + // populate the point data structures - for (int k = 0; k < num_points_k; k++) { - for (int j = 0; j < num_points_j; j++) { - for (int i = 0; i < num_points_i; i++) { - // global id for the point - int node_gid = get_id(i, j, k, num_points_i, num_points_j); + // for (int k = 0; k < num_points_k; k++) { + // for (int j = 0; j < num_points_j; j++) { + // for (int i = 0; i < num_points_i; i++) { + // // global id for the point + // int node_gid = get_id(i, j, k, num_points_i, num_points_j); - // store the point coordinates - node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; - node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; - node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; - } // end for i - } // end for j - } // end for k + // // store the point coordinates + // node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; + // node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; + // node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; + // } // end for i + // } // end for j + // } // end for k node.coords.update_device(); @@ -212,43 +233,75 @@ void build_3d_box( // --- Build elems --- - // populate the elem center data structures - for (int k = 0; k < num_elems_k; k++) { - for (int j = 0; j < num_elems_j; j++) { - for (int i = 0; i < num_elems_i; i++) { - // global id for the elem - int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); - - // store the point IDs for this elem where the range is - // (i:i+1, j:j+1, k:k+1) for a linear hexahedron - int this_point = 0; - for (int kcount = k; kcount <= k + 1; kcount++) { - for (int 
jcount = j; jcount <= j + 1; jcount++) { - for (int icount = i; icount <= i + 1; icount++) { - // global id for the points - int node_gid = get_id(icount, jcount, kcount, - num_points_i, num_points_j); - - // convert this_point index to the FE index convention - int this_index = this_point; //convert_point_number_in_Hex(this_point); - - // store the points in this elem according the the finite - // element numbering convention - mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; - - // increment the point counting index - this_point = this_point + 1; - } // end for icount - } // end for jcount - } // end for kcount - } // end for i - } // end for j - } // end for k + // // populate the elem center data structures + // for (int k = 0; k < num_elems_k; k++) { + // for (int j = 0; j < num_elems_j; j++) { + // for (int i = 0; i < num_elems_i; i++) { + + // // global id for the elem + // int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); + + // // store the point IDs for this elem where the range is + // // (i:i+1, j:j+1, k:k+1) for a linear hexahedron + // int this_point = 0; + // for (int kcount = k; kcount <= k + 1; kcount++) { + // for (int jcount = j; jcount <= j + 1; jcount++) { + // for (int icount = i; icount <= i + 1; icount++) { + // // global id for the points + // int node_gid = get_id(icount, jcount, kcount, + // num_points_i, num_points_j); + + // // convert this_point index to the FE index convention + // int this_index = this_point; //convert_point_number_in_Hex(this_point); + + // // store the points in this elem according the the finite + // // element numbering convention + // mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + + // // increment the point counting index + // this_point = this_point + 1; + // } // end for icount + // } // end for jcount + // } // end for kcount + // } // end for i + // } // end for j + // } // end for k + + // populate the point data structures + FOR_ALL(k, 0, num_elems_k, + j, 0, num_elems_j, + 
i, 0, num_elems_i,{ + + // global id for the elem + size_t elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); + + // store the point IDs for this elem where the range is + // (i:i+1, j:j+1, k:k+1) for a linear hexahedron + int this_point = 0; + for (int kcount = k; kcount <= k + 1; kcount++) { + for (int jcount = j; jcount <= j + 1; jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + size_t node_gid = get_id(icount, jcount, kcount, + num_points_i, num_points_j); + + // convert this_point index to the FE index convention + int this_index = this_point; //convert_point_number_in_Hex(this_point); + + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + + // increment the point counting index + this_point++; + } // end for icount + } // end for jcount + } // end for kcount + }); // end parallel for // update device side mesh.nodes_in_elem.update_device(); - - + Kokkos::fence(); // Build connectivity mesh.build_connectivity(); From b0a1924793343e237e1bf055c809319ed47d35de Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 21 Nov 2025 16:08:08 -0600 Subject: [PATCH 40/52] STYLE: Tidying up, and reducing memory overhead --- examples/mesh_decomp/decomp_utils.h | 193 +++++++++++---------------- examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mesh_io.h | 69 ++-------- 3 files changed, 91 insertions(+), 173 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index ac1fd6e9..18a3508a 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -51,8 +51,8 @@ void naive_partition_mesh( node_t& initial_node, Mesh_t& naive_mesh, node_t& naive_node, - std::vector& elems_in_elem_on_rank, - std::vector& num_elems_in_elem_per_rank, + CArrayDual& elems_in_elem_on_rank, + CArrayDual& num_elems_in_elem_per_rank, int world_size, int rank) { @@ 
-61,25 +61,12 @@ void naive_partition_mesh( int num_elements_on_rank = 0; int num_nodes_on_rank = 0; - int num_nodes_per_elem = 0; - - - std::vector nodes_on_rank; + int num_dim = initial_mesh.num_dims; + // Compute the number of elements to send to each rank and num_nodes_per_elem std::vector elems_per_rank(world_size); // number of elements to send to each rank size(world_size) - std::vector nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) - - // create a 2D vector of elements to send to each rank - std::vector> elements_to_send(world_size); - - // create a 2D vector of nodes to send to each rank - std::vector> nodes_to_send(world_size); - - // Create a 2D vector to hold the nodal positions on each rank - std::vector> node_pos_to_send(world_size); - if (rank == 0) { num_nodes_per_elem = initial_mesh.num_nodes_in_elem; @@ -107,9 +94,7 @@ void naive_partition_mesh( // int root, MPI_Comm comm) MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting the scatter operation for the element counts per rank"< elements_on_rank(num_elements_on_rank); - MPI_Barrier(MPI_COMM_WORLD); - double t_scatter_end = MPI_Wtime(); + // ******************************************************** // Scatter the actual element global ids to each rank // ******************************************************** - double t_scatter_gids_start = MPI_Wtime(); + // create a 2D vector of elements to send to each rank + std::vector> elements_to_send(world_size); if (rank == 0) { // Populate the elements_to_send array by finding all elements in the elements_per_rank array and adding them to the elements_to_send array - int elem_gid = 0; for (int rank = 0; rank < world_size; rank++) { - for (int j = 0; j < elems_per_rank[rank]; j++) { elements_to_send[rank].push_back(elem_gid); elem_gid++; @@ -181,12 +164,13 @@ void naive_partition_mesh( // Wait for all ranks to complete the scatter operation MPI_Barrier(MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - 
if(rank == 0) std::cout<<" After scattering element counts per rank"< nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) + std::vector nodes_on_rank; // node gids the current rank + std::vector> nodes_to_send(world_size); // nodes to send to each rank + if (rank == 0) { // Populate the nodes_to_send array by finding all nodes in the elements in elements_to_send and removing duplicates @@ -203,40 +187,17 @@ void naive_partition_mesh( for (int i = 0; i < world_size; i++) { nodes_per_rank[i] = nodes_to_send[i].size(); } - - if (print_info) { - std::cout< node_pos_on_rank_flat(num_nodes_on_rank * 3); - - if(rank == 0) - { - for (int i = 0; i < world_size; i++) { - for(int node_gid = 0; node_gid < nodes_to_send[i].size(); node_gid++) - { - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 0)); - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 1)); - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 2)); - } - } + // Create a flat 1D vector for node positions (num_dim coordinates per node) + std::vector node_pos_on_rank_flat(num_nodes_on_rank * num_dim); + CArrayDual node_pos_on_rank(num_nodes_on_rank, num_dim, "node_pos_on_rank_decomp"); + + if(rank == 0){ // Prepare data for MPI_Scatterv (scatter with variable counts) // Flatten the 2D node_pos_to_send into a 1D array @@ -305,29 +258,30 @@ void naive_partition_mesh( int displacement = 0; for (int i = 0; i < world_size; i++) { - sendcounts[i] = nodes_to_send[i].size() * 3; + sendcounts[i] = nodes_to_send[i].size() * num_dim; displs[i] = displacement; // displacement is the starting index of the nodes for the current rank in the flattened array // Copy node positions for rank i to the flattened array - for(int j = 0; j < nodes_to_send[i].size(); j++) { - for(int k = 0; k < 3; k++) { - all_node_pos.push_back(node_pos_to_send[i][j * 3 + k]); + for(int node_gid = 0; node_gid < 
nodes_to_send[i].size(); node_gid++) { + for(int dim = 0; dim < num_dim; dim++) { + all_node_pos.push_back(initial_node.coords.host(nodes_to_send[i][node_gid], dim)); } } - displacement += nodes_to_send[i].size() * 3; + displacement += nodes_to_send[i].size() * num_dim; } // Send the node positions to each rank MPI_Scatterv(all_node_pos.data(), sendcounts.data(), displs.data(), MPI_DOUBLE, - node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + node_pos_on_rank.host_pointer(), num_nodes_on_rank * num_dim, MPI_DOUBLE, 0, MPI_COMM_WORLD); } else { MPI_Scatterv(nullptr, nullptr, nullptr, MPI_DOUBLE, - node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + node_pos_on_rank.host_pointer(), num_nodes_on_rank * num_dim, MPI_DOUBLE, 0, MPI_COMM_WORLD); } MPI_Barrier(MPI_COMM_WORLD); + node_pos_on_rank.update_device(); // ****************************************************************************************** // Initialize the node state variables @@ -335,16 +289,14 @@ void naive_partition_mesh( // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers std::vector required_node_state = { node_state::coords }; - - naive_node.initialize(num_nodes_on_rank, 3, required_node_state); + naive_node.initialize(num_nodes_on_rank, num_dim, required_node_state); - for(int i = 0; i < num_nodes_on_rank; i++) { - naive_node.coords.host(i, 0) = node_pos_on_rank_flat[i*3]; - naive_node.coords.host(i, 1) = node_pos_on_rank_flat[i*3+1]; - naive_node.coords.host(i, 2) = node_pos_on_rank_flat[i*3+2]; - } + FOR_ALL(node_id, 0, num_nodes_on_rank, + dim, 0, num_dim,{ + naive_node.coords(node_id, dim) = node_pos_on_rank(node_id, dim); + }); - naive_node.coords.update_device(); + naive_node.coords.update_host(); // ****************************************************************************************** // Send the element-node connectivity data from the initial mesh to each rank @@ -374,14 +326,17 @@ void 
naive_partition_mesh( } MPI_Barrier(MPI_COMM_WORLD); - std::vector nodes_in_elem_on_rank_size_t(num_elements_on_rank * num_nodes_per_elem); - MPI_Scatterv(global_nodes_in_elem, conn_sendcounts.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, - nodes_in_elem_on_rank_size_t.data(), nodes_in_elem_on_rank_size_t.size(), MPI_UNSIGNED_LONG_LONG, - 0, MPI_COMM_WORLD); + { //scope to free memory for tmp vector + std::vector nodes_in_elem_on_rank_size_t(num_elements_on_rank * num_nodes_per_elem); + + MPI_Scatterv(global_nodes_in_elem, conn_sendcounts.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, + nodes_in_elem_on_rank_size_t.data(), nodes_in_elem_on_rank_size_t.size(), MPI_UNSIGNED_LONG_LONG, + 0, MPI_COMM_WORLD); - for (size_t idx = 0; idx < nodes_in_elem_on_rank_size_t.size(); ++idx) { - nodes_in_elem_on_rank[idx] = static_cast(nodes_in_elem_on_rank_size_t[idx]); + for (size_t idx = 0; idx < nodes_in_elem_on_rank_size_t.size(); ++idx) { + nodes_in_elem_on_rank[idx] = static_cast(nodes_in_elem_on_rank_size_t[idx]); + } } // ****************************************************************************************** @@ -390,12 +345,10 @@ void naive_partition_mesh( // First, rank 0 computes how many connectivity entries each rank will receive // and scatters that information - std::vector elem_elem_counts(world_size); int total_elem_elem_entries = 0; - - - double t_scatter_elem_elem_start = MPI_Wtime(); + std::vector elem_elem_counts(world_size); + if (rank == 0){ // Calculate total number of connectivity entries for each rank for(int i = 0; i < world_size; i++) { @@ -413,12 +366,11 @@ void naive_partition_mesh( 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" After scattering the element-element connectivity data to each rank"<(total_elem_elem_entries, "elems_in_elem_on_rank"); + // Now scatter the num_elems_in_elem for each element on each rank - num_elems_in_elem_per_rank.resize(num_elements_on_rank); + num_elems_in_elem_per_rank = 
CArrayDual(num_elements_on_rank, "num_elems_in_elem_per_rank"); if (rank == 0) { std::vector all_num_elems_in_elem; @@ -434,14 +386,16 @@ void naive_partition_mesh( } MPI_Scatterv(all_num_elems_in_elem.data(), elems_per_rank.data(), displs_ee.data(), MPI_INT, - num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + num_elems_in_elem_per_rank.host_pointer(), num_elements_on_rank, MPI_INT, 0, MPI_COMM_WORLD); } else { MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + num_elems_in_elem_per_rank.host_pointer(), num_elements_on_rank, MPI_INT, 0, MPI_COMM_WORLD); } + num_elems_in_elem_per_rank.update_device(); + if (rank == 0){ // Prepare the element-element connectivity data for each rank std::vector all_elems_in_elem; @@ -465,22 +419,24 @@ void naive_partition_mesh( // Send the element-element connectivity data to each rank using MPI_Scatterv MPI_Scatterv(all_elems_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + elems_in_elem_on_rank.host_pointer(), total_elem_elem_entries, MPI_INT, 0, MPI_COMM_WORLD); } else { MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + elems_in_elem_on_rank.host_pointer(), total_elem_elem_entries, MPI_INT, 0, MPI_COMM_WORLD); } + elems_in_elem_on_rank.update_device(); + MPI_Barrier(MPI_COMM_WORLD); // ****************************************************************************************** // Initialize the naive_mesh data structures for each rank // ****************************************************************************************** naive_mesh.initialize_nodes(num_nodes_on_rank); - naive_mesh.initialize_elems(num_elements_on_rank, 3); + naive_mesh.initialize_elems(num_elements_on_rank, num_dim); naive_mesh.local_to_global_node_mapping = DCArrayKokkos(num_nodes_on_rank, "naive_mesh.local_to_global_node_mapping"); 
naive_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "naive_mesh.local_to_global_elem_mapping"); @@ -504,12 +460,11 @@ void naive_partition_mesh( // rebuild the local element-node connectivity using the local node ids for(int i = 0; i < num_elements_on_rank; i++) { for(int j = 0; j < num_nodes_per_elem; j++) { - int node_gid = nodes_in_elem_on_rank[i * num_nodes_per_elem + j]; int node_lid = -1; - // Use binary search to find the local node index for node_gid + // Use binary search to find the local node index for node_gid, local_to_global_node_mapping is sorted int left = 0, right = num_nodes_on_rank - 1; while (left <= right) { int mid = left + (right - left) / 2; @@ -541,7 +496,6 @@ void naive_partition_mesh( // ****************************************************************************************** // Build the connectivity for the local naive_mesh // ****************************************************************************************** - naive_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); @@ -1689,6 +1643,8 @@ void partition_mesh( bool print_info = false; bool print_vtk = false; + int num_dim = initial_mesh.num_dims; + // Create mesh, gauss points, and node data structures on each rank // This is the initial partitioned mesh Mesh_t naive_mesh; @@ -1698,19 +1654,17 @@ void partition_mesh( Mesh_t intermediate_mesh; node_t intermediate_node; - // Helper arrays to hold element-element connectivity for naive partitioning that include what would be ghost, without having to build the full mesh - std::vector elems_in_elem_on_rank; - std::vector num_elems_in_elem_per_rank; + CArrayDual elems_in_elem_on_rank; + CArrayDual num_elems_in_elem_per_rank; // Perform the naive partitioning of the mesh MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) std::cout << "Performing the naive partitioning of the mesh" << std::endl; - naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, 
num_elems_in_elem_per_rank, world_size, rank); MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) std::cout << "Naive partitioning of the mesh completed" << std::endl; + if (rank == 0) std::cout << "Begin repartitioning using PT-Scotch" << std::endl; /********************************************************************************** * Build PT-Scotch distributed graph representation of the mesh for repartitioning * @@ -1791,7 +1745,22 @@ void partition_mesh( // edgeloctab: flat array of neighbor global IDs for all local elements, built in order std::vector edgeloctab; - edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance + // edgeloctab holds the flattened list of all neighbors (edges) for all local elements, + // in a compact CSR (Compressed Sparse Row) format expected by PT-Scotch. Each entry is a global element ID + // of a neighbor. The edgeloctab array is built incrementally with one entry per element neighbor edge, + // so we reserve its capacity up front for efficiency. + // + // Heuristic: For unstructured 3D hexahedral meshes, a single element can have significantly more neighbors + // than in 2D cases. In a fully structured 3D grid, each hexahedral element can have up to 26 neighbors + // (since it may touch all surrounding elements along all axes). In unstructured grids, it's possible for some + // elements to have even more neighbors due to mesh irregularities and refinements. + // + // For most practical unstructured hexahedral meshes, values in the low 20s are common, but extreme cases + // (e.g., high-order connectivity, pathological splits, or meshes with "hanging nodes") may see higher counts. + // Using vertlocnbr * 26 as an upper limit is a reasonable estimate for fully connected (structured) cases, + // but consider increasing this if working with highly unstructured or pathological meshes. For safety and + // to avoid repeated reallocations during construction, we use 26 here as a conservative guess. 
+ edgeloctab.reserve(vertlocnbr * 26); // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. @@ -1800,7 +1769,7 @@ void partition_mesh( for (size_t k = 0; k < naive_mesh.num_elems; k++) { int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); elem_gid_to_offset[elem_gid_on_rank] = current_offset; - current_offset += num_elems_in_elem_per_rank[k]; // WARNING< THIS MUST INCLUDE GHOST< WHICH DONT EXISTS ON THE NAIVE MESH + current_offset += num_elems_in_elem_per_rank(k); } // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- @@ -1827,11 +1796,11 @@ void partition_mesh( break; } } - size_t num_nbrs = num_elems_in_elem_per_rank[idx]; + size_t num_nbrs = num_elems_in_elem_per_rank(idx); // Append each neighbor (by its GLOBAL elem GID) to edgeloctab for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! + size_t neighbor_gid = elems_in_elem_on_rank(elems_in_elem_offset + j); // This is a global element ID! 
edgeloctab.push_back(static_cast(neighbor_gid)); ++offset; // Increment running edge count } @@ -2162,9 +2131,9 @@ void partition_mesh( int node_lid = naive_mesh.nodes_in_elem.host(lid, j); int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 0)); - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 1)); - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 2)); + for(int dim = 0; dim < num_dim; dim++) { + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, dim)); + } } } } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 9663b306..a5de7a8b 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {100, 100, 100}; + int num_elems_dim[3] = {200, 200, 200}; // Initial mesh built on rank zero Mesh_t initial_mesh; diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 1a043ca1..a3530a07 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -205,68 +205,16 @@ void build_3d_box( size_t node_gid = get_id(i, j, k, num_points_i, num_points_j); // store the point coordinates - node.coords.host(node_gid, 0) = origin_mtr(0) + (double)i * dx; - node.coords.host(node_gid, 1) = origin_mtr(1) + (double)j * dy; - node.coords.host(node_gid, 2) = origin_mtr(2) + (double)k * dz; + node.coords(node_gid, 0) = origin_mtr(0) + (double)i * dx; + node.coords(node_gid, 1) = origin_mtr(1) + (double)j * dy; + node.coords(node_gid, 2) = origin_mtr(2) + (double)k * dz; }); - - // populate the point data structures - // for (int k = 0; k < num_points_k; k++) { - // for (int j = 0; j < num_points_j; j++) { - // for (int i = 0; i < num_points_i; i++) { - // // global id for the point - // 
int node_gid = get_id(i, j, k, num_points_i, num_points_j); - - // // store the point coordinates - // node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; - // node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; - // node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; - // } // end for i - // } // end for j - // } // end for k - - - node.coords.update_device(); + // Update the host side + node.coords.update_host(); // initialize elem variables mesh.initialize_elems(num_elems, num_dim); - // --- Build elems --- - - // // populate the elem center data structures - // for (int k = 0; k < num_elems_k; k++) { - // for (int j = 0; j < num_elems_j; j++) { - // for (int i = 0; i < num_elems_i; i++) { - - // // global id for the elem - // int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); - - // // store the point IDs for this elem where the range is - // // (i:i+1, j:j+1, k:k+1) for a linear hexahedron - // int this_point = 0; - // for (int kcount = k; kcount <= k + 1; kcount++) { - // for (int jcount = j; jcount <= j + 1; jcount++) { - // for (int icount = i; icount <= i + 1; icount++) { - // // global id for the points - // int node_gid = get_id(icount, jcount, kcount, - // num_points_i, num_points_j); - - // // convert this_point index to the FE index convention - // int this_index = this_point; //convert_point_number_in_Hex(this_point); - - // // store the points in this elem according the the finite - // // element numbering convention - // mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; - - // // increment the point counting index - // this_point = this_point + 1; - // } // end for icount - // } // end for jcount - // } // end for kcount - // } // end for i - // } // end for j - // } // end for k - // populate the point data structures FOR_ALL(k, 0, num_elems_k, j, 0, num_elems_j, @@ -290,7 +238,7 @@ void build_3d_box( // store the points in this elem according the the finite // element numbering convention - 
mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + mesh.nodes_in_elem(elem_gid, this_index) = node_gid; // increment the point counting index this_point++; @@ -299,8 +247,9 @@ void build_3d_box( } // end for kcount }); // end parallel for - // update device side - mesh.nodes_in_elem.update_device(); + // Update the host side + mesh.nodes_in_elem.update_host(); + Kokkos::fence(); // Build connectivity From 052aa7c12e7909b1deb47dc6f41b58096c01d3c1 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 11:30:42 -0600 Subject: [PATCH 41/52] ENH: Adding GPU safety, WIP --- examples/mesh_decomp/decomp_utils.h | 67 ++++++++++++++++++++++------ examples/mesh_decomp/mesh_decomp.cpp | 2 +- 2 files changed, 55 insertions(+), 14 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 18a3508a..617d4014 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -350,11 +350,18 @@ void naive_partition_mesh( std::vector elem_elem_counts(world_size); if (rank == 0){ + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + tmp_num_elems_in_elem.update_host(); + MATAR_FENCE(); // Calculate total number of connectivity entries for each rank for(int i = 0; i < world_size; i++) { elem_elem_counts[i] = 0; for(int k = 0; k < elements_to_send[i].size(); k++) { - elem_elem_counts[i] += initial_mesh.num_elems_in_elem(elements_to_send[i][k]); + elem_elem_counts[i] += tmp_num_elems_in_elem.host(elements_to_send[i][k]); } } } @@ -366,6 +373,7 @@ void naive_partition_mesh( 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout<< " Finished scatter" <(total_elem_elem_entries, "elems_in_elem_on_rank"); @@ -376,12 +384,24 @@ void naive_partition_mesh( std::vector all_num_elems_in_elem; std::vector displs_ee(world_size); int 
displacement = 0; + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + tmp_num_elems_in_elem.update_host(); + MATAR_FENCE(); for(int i = 0; i < world_size; i++) { displs_ee[i] = displacement; + + std::cout<< "Rank = "<< i < all_elems_in_elem; std::vector sendcounts(world_size); std::vector displs(world_size); int displacement = 0; + + DRaggedRightArrayKokkos tmp_elems_in_elem(initial_mesh.num_elems_in_elem, "temp_elem_in_elem"); + + FOR_ALL(elem_gid, 0, initial_mesh.num_elems, { + for (size_t i = 0; i < initial_mesh.num_elems_in_elem(elem_gid); i++) { + tmp_elems_in_elem(elem_gid, i) = initial_mesh.elems_in_elem(elem_gid, i); + } // end for i + }); // end FOR_ALL elems + tmp_elems_in_elem.update_host(); + Kokkos::fence(); + + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + tmp_num_elems_in_elem.update_host(); + MATAR_FENCE(); for(int i = 0; i < world_size; i++) { sendcounts[i] = elem_elem_counts[i]; @@ -410,8 +450,8 @@ void naive_partition_mesh( // Copy element-element connectivity for rank i for(int k = 0; k < elements_to_send[i].size(); k++) { - for(int l = 0; l < initial_mesh.num_elems_in_elem(elements_to_send[i][k]); l++) { - all_elems_in_elem.push_back(initial_mesh.elems_in_elem(elements_to_send[i][k], l)); + for(int l = 0; l < tmp_num_elems_in_elem.host(elements_to_send[i][k]); l++) { + all_elems_in_elem.push_back(tmp_elems_in_elem.host(elements_to_send[i][k], l)); } } displacement += elem_elem_counts[i]; @@ -2287,16 +2327,15 @@ void partition_mesh( gauss_point.fields_vec.communicate(); // Loop over all elements and average the values of elements connected to that element - for (int i = 0; i < final_mesh.num_elems; i++) { + FOR_ALL(i, 0, final_mesh.num_elems, { double 
value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { value += gauss_point.fields.host(final_mesh.elems_in_elem(i, j)); } value /= final_mesh.num_elems_in_elem(i); gauss_point.fields.host(i) = value; - } - for (int i = 0; i < final_mesh.num_elems; i++) { - double value = 0.0; + + value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { value += gauss_point.fields_vec.host(final_mesh.elems_in_elem(i, j), 0); } @@ -2304,7 +2343,8 @@ void partition_mesh( gauss_point.fields_vec.host(i, 0) = value; gauss_point.fields_vec.host(i, 1) = value; gauss_point.fields_vec.host(i, 2) = value; - } + }); + gauss_point.fields_vec.update_device(); @@ -2336,18 +2376,19 @@ void partition_mesh( // Update scalar field to visualize the communication + FOR_ALL(i, 0, final_mesh.num_elems, { - for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { double value = 0.0; for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); + value += final_node.scalar_field(final_mesh.nodes_in_elem(elem_lid, j)); } value /= final_mesh.num_nodes_in_elem; for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; + final_node.scalar_field(final_mesh.nodes_in_elem(elem_lid, j)) = value; } - } + }); + } #endif // DECOMP_UTILS_H \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index a5de7a8b..92732a88 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {200, 200, 200}; + int num_elems_dim[3] = {60, 60, 60}; // Initial mesh built on rank zero Mesh_t initial_mesh; From 4cc2a709ef8b8775eadbb0e4231f96c61db9aec9 Mon Sep 17 00:00:00 2001 From: Jacob 
Moore Date: Mon, 24 Nov 2025 14:00:18 -0600 Subject: [PATCH 42/52] BUG: Chasing CUDA build and run bugs --- examples/mesh_decomp/decomp_utils.h | 112 +++++++++++++++------------ examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mesh_io.h | 27 +++++-- src/include/communication_plan.h | 6 +- 4 files changed, 88 insertions(+), 59 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 617d4014..6eb758ee 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -398,7 +398,7 @@ void naive_partition_mesh( std::cout<< "Rank = "<< i < owned_coords_send(3*local_owned_count, 0.0); + std::vector owned_coords_send(num_dim*local_owned_count, 0.0); for (int i = 0; i < local_owned_count; i++) { - owned_coords_send[3*i+0] = input_node.coords.host(i,0); - owned_coords_send[3*i+1] = input_node.coords.host(i,1); - owned_coords_send[3*i+2] = input_node.coords.host(i,2); + for(int dim = 0; dim < num_dim; dim++){ + owned_coords_send[num_dim*i+dim] = input_node.coords.host(i,dim); + } } - std::vector all_owned_coords(3 * total_owned, 0.0); + std::vector all_owned_coords(num_dim * total_owned, 0.0); // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Getting coord_counts" << std::endl; + std::vector coord_counts(world_size); std::vector coord_displs(world_size); for (int r = 0; r < world_size; r++) { - coord_counts[r] = 3 * owned_counts[r]; // Each node has 3 doubles - coord_displs[r] = 3 * owned_displs[r]; // Displacement in doubles + coord_counts[r] = num_dim * owned_counts[r]; // Each node has num_dim doubles + coord_displs[r] = num_dim * owned_displs[r]; // Displacement in doubles } - MPI_Allgatherv(owned_coords_send.data(), 3*local_owned_count, MPI_DOUBLE, + MPI_Allgatherv(owned_coords_send.data(), num_dim*local_owned_count, MPI_DOUBLE, all_owned_coords.data(), coord_counts.data(), 
coord_displs.data(), MPI_DOUBLE, MPI_COMM_WORLD); // e) Build map: gid -> coord[3] - std::unordered_map> gid_to_coord; + std::unordered_map> gid_to_coord; for (int i = 0; i < total_owned; i++) { - std::array xyz = { - all_owned_coords[3*i+0], - all_owned_coords[3*i+1], - all_owned_coords[3*i+2] - }; - gid_to_coord[all_owned_gids[i]] = xyz; + std::vector xyz(num_dim); // size is runtime-dependent + for (int dim = 0; dim < num_dim; dim++) { + xyz[dim] = all_owned_coords[num_dim * i + dim]; + } + gid_to_coord[all_owned_gids[i]] = std::move(xyz); } // 4. Finally, fill output_node.coords with correct coordinates. @@ -1239,14 +1242,14 @@ void build_ghost( size_t gid = output_mesh.local_to_global_node_mapping.host(i); auto it = gid_to_coord.find(gid); if (it != gid_to_coord.end()) { - output_node.coords.host(i,0) = it->second[0]; - output_node.coords.host(i,1) = it->second[1]; - output_node.coords.host(i,2) = it->second[2]; + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = it->second[dim]; + } } else { // Could happen if there's a bug: fill with zeros for safety - output_node.coords.host(i,0) = 0.0; - output_node.coords.host(i,1) = 0.0; - output_node.coords.host(i,2) = 0.0; + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = 0.0; + } } } output_node.coords.update_device(); @@ -1314,6 +1317,7 @@ void build_ghost( } MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<"After boundary_elem_targets"< boundary_elem_local_ids; @@ -1354,7 +1358,7 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); output_mesh.num_boundary_elems = boundary_elem_local_ids.size(); - output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems); + output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems, "boundary_elem_local_ids"); for (int i = 0; i < output_mesh.num_boundary_elems; i++) { output_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; } @@ -1421,7 +1425,7 @@ void 
build_ghost( // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) // Could be used to specify communication volume if needed for optimization - int* sourceweights = MPI_UNWEIGHTED; + // int* sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1448,7 +1452,7 @@ void build_ghost( int* node_sources = (node_indegree > 0) ? ghost_node_receive_ranks_vec.data() : MPI_UNWEIGHTED; // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) - int* node_sourceweights = MPI_UNWEIGHTED; + //int* node_sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1457,11 +1461,12 @@ void build_ghost( int* node_destinations = (node_outdegree > 0) ? ghost_node_send_ranks_vec.data() : MPI_UNWEIGHTED; // destinationweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) - int* node_destinationweights = MPI_UNWEIGHTED; + // int* node_destinationweights = MPI_UNWEIGHTED; // Initialize the graph communicator for node communication node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout<<"After node graph communicator"< strides_array(element_communication_plan.num_send_ranks); + DCArrayKokkos strides_array(element_communication_plan.num_send_ranks, "strides_for_elems_to_send"); for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { int dest_rank = element_communication_plan.send_rank_ids.host(i); - strides_array(i) = elems_to_send_by_rank[dest_rank].size(); + strides_array.host(i) = elems_to_send_by_rank[dest_rank].size(); } + strides_array.update_device(); DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); // Fill in the data @@ -1517,12 
+1523,13 @@ void build_ghost( } // ========== Serialize into a DRaggedRightArrayKokkos ========== - CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); + DCArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks, "elem_recv_strides_array"); for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { int source_rank = element_communication_plan.recv_rank_ids.host(i); - elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); + elem_recv_strides_array.host(i) = elems_to_recv_by_rank[source_rank].size(); } + elem_recv_strides_array.update_device(); DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); // Fill in the data for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { @@ -1532,6 +1539,7 @@ void build_ghost( } } elems_to_recv_by_rank_rr.update_device(); + MATAR_FENCE(); element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); @@ -1547,11 +1555,12 @@ void build_ghost( // -------------------------------------------------------------------------------------- // Serialize into a DRaggedRightArrayKokkos - CArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks); + DCArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks,"node_send_strides_array"); for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); - node_send_strides_array(i) = nodes_to_send_by_rank[dest_rank].size(); + node_send_strides_array.host(i) = nodes_to_send_by_rank[dest_rank].size(); } + node_send_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_send_by_rank_rr(node_send_strides_array, "nodes_to_send_by_rank"); // Fill in the data @@ -1616,11 +1625,12 @@ void build_ghost( } // Serialize into a DRaggedRightArrayKokkos - CArrayKokkos 
nodes_recv_strides_array(node_communication_plan.num_recv_ranks); + DCArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks, "nodes_recv_strides_array"); for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { int source_rank = node_communication_plan.recv_rank_ids.host(i); - nodes_recv_strides_array(i) = nodes_to_recv_by_rank[source_rank].size(); + nodes_recv_strides_array.host(i) = nodes_to_recv_by_rank[source_rank].size(); } + nodes_recv_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_recv_by_rank_rr(nodes_recv_strides_array, "nodes_to_recv_by_rank"); // Fill in the data for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { @@ -1681,7 +1691,7 @@ void partition_mesh( int rank){ bool print_info = false; - bool print_vtk = false; + // bool print_vtk = false; int num_dim = initial_mesh.num_dims; @@ -1809,7 +1819,7 @@ void partition_mesh( for (size_t k = 0; k < naive_mesh.num_elems; k++) { int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); elem_gid_to_offset[elem_gid_on_rank] = current_offset; - current_offset += num_elems_in_elem_per_rank(k); + current_offset += num_elems_in_elem_per_rank.host(k); } // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- @@ -1836,11 +1846,11 @@ void partition_mesh( break; } } - size_t num_nbrs = num_elems_in_elem_per_rank(idx); + size_t num_nbrs = num_elems_in_elem_per_rank.host(idx); // Append each neighbor (by its GLOBAL elem GID) to edgeloctab for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank(elems_in_elem_offset + j); // This is a global element ID! + size_t neighbor_gid = elems_in_elem_on_rank.host(elems_in_elem_offset + j); // This is a global element ID! 
edgeloctab.push_back(static_cast(neighbor_gid)); ++offset; // Increment running edge count } @@ -2206,8 +2216,8 @@ void partition_mesh( // -------------- Phase 6: Build the intermediate_mesh -------------- intermediate_mesh.initialize_nodes(num_new_nodes); intermediate_mesh.initialize_elems(num_new_elems, naive_mesh.num_dims); - intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); - intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes, "intermediate_mesh.local_to_global_node_mapping"); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems, "intermediate_mesh.local_to_global_elem_mapping"); // Fill global mappings for (int i = 0; i < num_new_nodes; i++) @@ -2330,19 +2340,19 @@ void partition_mesh( FOR_ALL(i, 0, final_mesh.num_elems, { double value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields.host(final_mesh.elems_in_elem(i, j)); + value += gauss_point.fields(final_mesh.elems_in_elem(i, j)); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields.host(i) = value; + gauss_point.fields(i) = value; value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields_vec.host(final_mesh.elems_in_elem(i, j), 0); + value += gauss_point.fields_vec(final_mesh.elems_in_elem(i, j), 0); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields_vec.host(i, 0) = value; - gauss_point.fields_vec.host(i, 1) = value; - gauss_point.fields_vec.host(i, 2) = value; + gauss_point.fields_vec(i, 0) = value; + gauss_point.fields_vec(i, 1) = value; + gauss_point.fields_vec(i, 2) = value; }); gauss_point.fields_vec.update_device(); @@ -2380,12 +2390,12 @@ void partition_mesh( double value = 0.0; for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - value += final_node.scalar_field(final_mesh.nodes_in_elem(elem_lid, j)); + value += 
final_node.scalar_field(final_mesh.nodes_in_elem(i, j)); } value /= final_mesh.num_nodes_in_elem; for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - final_node.scalar_field(final_mesh.nodes_in_elem(elem_lid, j)) = value; + final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = value; } }); diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 92732a88..a580052a 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {60, 60, 60}; + int num_elems_dim[3] = {20, 20, 20}; // Initial mesh built on rank zero Mesh_t initial_mesh; diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index a3530a07..04c2cad1 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -191,12 +191,13 @@ void build_3d_box( // --- Build nodes --- CArrayDual origin_mtr(3, "origin_mtr"); - origin_mtr(0) = origin[0]; - origin_mtr(1) = origin[1]; - origin_mtr(2) = origin[2]; + origin_mtr.host(0) = origin[0]; + origin_mtr.host(1) = origin[1]; + origin_mtr.host(2) = origin[2]; origin_mtr.update_device(); // populate the point data structures + std::cout<<"First FOR_ALL"<(num_elems, num_cell_scalar_vars); auto elem_vec_fields = CArray(num_elems, num_cell_vec_vars, 3); + + DCArrayKokkos num_elems_in_elem(mesh.num_elems, "tmp_num_elem_in_elem"); + FOR_ALL(i, 0, mesh.num_elems, { + num_elems_in_elem(i) = (double)mesh.num_elems_in_elem(i); + }); + MATAR_FENCE(); + num_elems_in_elem.update_host(); for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { elem_fields(elem_gid, 0) = rank; - elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); + elem_fields(elem_gid, 1) = num_elems_in_elem.host(elem_gid); elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); 
elem_fields(elem_gid, 3) = gauss_point.fields.host(elem_gid); elem_vec_fields(elem_gid, 0, 0) = gauss_point.fields_vec.host(elem_gid, 0); @@ -594,6 +603,14 @@ void write_vtu(Mesh_t& mesh, CArray vec_fields(num_nodes, num_point_vec_vars, 3); CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + DCArrayKokkos num_elems_in_node(mesh.num_elems, "tmp_num_elems_in_node"); + FOR_ALL(i, 0, mesh.num_elems, { + num_elems_in_node(i) = (double)mesh.num_corners_in_node(i); + }); + MATAR_FENCE(); + num_elems_in_node.update_host(); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { // position, var 0 vec_fields(node_gid, 0, 0) = node.coords.host(node_gid, 0); @@ -606,7 +623,7 @@ void write_vtu(Mesh_t& mesh, vec_fields(node_gid, 1, 2) = node.vector_field.host(node_gid, 2); point_scalar_fields(node_gid, 0) = rank; - point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); + point_scalar_fields(node_gid, 1) = num_elems_in_node.host(node_gid); point_scalar_fields(node_gid, 2) = (double)mesh.local_to_global_node_mapping.host(node_gid); point_scalar_fields(node_gid, 3) = node.scalar_field.host(node_gid); } diff --git a/src/include/communication_plan.h b/src/include/communication_plan.h index 3c1c48e9..21091eb2 100644 --- a/src/include/communication_plan.h +++ b/src/include/communication_plan.h @@ -187,14 +187,16 @@ enum class communication_plan_type { // Copy and store send neighbor IDs (out-bound neighbors: where we will send data to) this->send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); for(int i = 0; i < num_send_ranks; i++){ - this->send_rank_ids(i) = send_rank_ids[i]; + this->send_rank_ids.host(i) = send_rank_ids[i]; } + this->send_rank_ids.update_device(); // Copy and store receive neighbor IDs (in-bound neighbors: where we will receive data from) this->recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); for(int i = 0; i < num_recv_ranks; i++){ - this->recv_rank_ids(i) = recv_rank_ids[i]; + 
this->recv_rank_ids.host(i) = recv_rank_ids[i]; } + this->recv_rank_ids.update_device(); // Create the distributed graph communicator. // This call links this process to its explicit send and receive neighbors. From 6ac9606899a516416cead8bbf9810bf27b84fa1b Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 14:10:51 -0600 Subject: [PATCH 43/52] BUG: Fixed GPU build, broke node comms, WIP --- examples/mesh_decomp/decomp_utils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 6eb758ee..d43aa497 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -2354,8 +2354,8 @@ void partition_mesh( gauss_point.fields_vec(i, 1) = value; gauss_point.fields_vec(i, 2) = value; }); - - gauss_point.fields_vec.update_device(); + gauss_point.fields.update_host(); + gauss_point.fields_vec.update_host(); @@ -2398,7 +2398,7 @@ void partition_mesh( final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = value; } }); - + final_node.scalar_field.update_host(); } #endif // DECOMP_UTILS_H \ No newline at end of file From 9217adb7017d2d839eafdbb838a5dd593d85d86c Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 15:14:13 -0600 Subject: [PATCH 44/52] BUG: Chasing nodal comms bug --- examples/mesh_decomp/decomp_utils.h | 139 ++++++++++++++-------------- 1 file changed, 68 insertions(+), 71 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index d43aa497..e0600761 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -398,7 +398,7 @@ void naive_partition_mesh( std::cout<< "Rank = "<< i < owned_coords_send(num_dim*local_owned_count, 0.0); + std::vector owned_coords_send(3*local_owned_count, 0.0); for (int i = 0; i < local_owned_count; i++) { - for(int dim = 0; dim < num_dim; dim++){ - owned_coords_send[num_dim*i+dim] = 
input_node.coords.host(i,dim); - } + owned_coords_send[3*i+0] = input_node.coords.host(i,0); + owned_coords_send[3*i+1] = input_node.coords.host(i,1); + owned_coords_send[3*i+2] = input_node.coords.host(i,2); } - std::vector all_owned_coords(num_dim * total_owned, 0.0); + std::vector all_owned_coords(3 * total_owned, 0.0); // Create coordinate-specific counts and displacements (in units of doubles, not nodes) - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Getting coord_counts" << std::endl; - std::vector coord_counts(world_size); std::vector coord_displs(world_size); for (int r = 0; r < world_size; r++) { - coord_counts[r] = num_dim * owned_counts[r]; // Each node has num_dim doubles - coord_displs[r] = num_dim * owned_displs[r]; // Displacement in doubles + coord_counts[r] = 3 * owned_counts[r]; // Each node has 3 doubles + coord_displs[r] = 3 * owned_displs[r]; // Displacement in doubles } - MPI_Allgatherv(owned_coords_send.data(), num_dim*local_owned_count, MPI_DOUBLE, + MPI_Allgatherv(owned_coords_send.data(), 3*local_owned_count, MPI_DOUBLE, all_owned_coords.data(), coord_counts.data(), coord_displs.data(), MPI_DOUBLE, MPI_COMM_WORLD); // e) Build map: gid -> coord[3] - std::unordered_map> gid_to_coord; + std::unordered_map> gid_to_coord; for (int i = 0; i < total_owned; i++) { - std::vector xyz(num_dim); // size is runtime-dependent - for (int dim = 0; dim < num_dim; dim++) { - xyz[dim] = all_owned_coords[num_dim * i + dim]; - } - gid_to_coord[all_owned_gids[i]] = std::move(xyz); + std::array xyz = { + all_owned_coords[3*i+0], + all_owned_coords[3*i+1], + all_owned_coords[3*i+2] + }; + gid_to_coord[all_owned_gids[i]] = xyz; } // 4. Finally, fill output_node.coords with correct coordinates. 
@@ -1242,14 +1239,14 @@ void build_ghost( size_t gid = output_mesh.local_to_global_node_mapping.host(i); auto it = gid_to_coord.find(gid); if (it != gid_to_coord.end()) { - for (int dim = 0; dim < num_dim; dim++) { - output_node.coords.host(i,dim) = it->second[dim]; - } + output_node.coords.host(i,0) = it->second[0]; + output_node.coords.host(i,1) = it->second[1]; + output_node.coords.host(i,2) = it->second[2]; } else { // Could happen if there's a bug: fill with zeros for safety - for (int dim = 0; dim < num_dim; dim++) { - output_node.coords.host(i,dim) = 0.0; - } + output_node.coords.host(i,0) = 0.0; + output_node.coords.host(i,1) = 0.0; + output_node.coords.host(i,2) = 0.0; } } output_node.coords.update_device(); @@ -1317,7 +1314,6 @@ void build_ghost( } MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<"After boundary_elem_targets"< boundary_elem_local_ids; @@ -1358,7 +1354,7 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); output_mesh.num_boundary_elems = boundary_elem_local_ids.size(); - output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems, "boundary_elem_local_ids"); + output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems); for (int i = 0; i < output_mesh.num_boundary_elems; i++) { output_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; } @@ -1425,7 +1421,7 @@ void build_ghost( // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) // Could be used to specify communication volume if needed for optimization - // int* sourceweights = MPI_UNWEIGHTED; + int* sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1452,7 +1448,7 @@ void build_ghost( int* node_sources = (node_indegree > 0) ? 
ghost_node_receive_ranks_vec.data() : MPI_UNWEIGHTED; // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) - //int* node_sourceweights = MPI_UNWEIGHTED; + int* node_sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1461,12 +1457,11 @@ void build_ghost( int* node_destinations = (node_outdegree > 0) ? ghost_node_send_ranks_vec.data() : MPI_UNWEIGHTED; // destinationweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) - // int* node_destinationweights = MPI_UNWEIGHTED; + int* node_destinationweights = MPI_UNWEIGHTED; // Initialize the graph communicator for node communication node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) std::cout<<"After node graph communicator"< strides_array(element_communication_plan.num_send_ranks, "strides_for_elems_to_send"); + CArrayKokkos strides_array(element_communication_plan.num_send_ranks); for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { int dest_rank = element_communication_plan.send_rank_ids.host(i); - strides_array.host(i) = elems_to_send_by_rank[dest_rank].size(); + strides_array(i) = elems_to_send_by_rank[dest_rank].size(); } - strides_array.update_device(); DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); // Fill in the data @@ -1523,13 +1517,12 @@ void build_ghost( } // ========== Serialize into a DRaggedRightArrayKokkos ========== - DCArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks, "elem_recv_strides_array"); + CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { int source_rank = element_communication_plan.recv_rank_ids.host(i); - elem_recv_strides_array.host(i) 
= elems_to_recv_by_rank[source_rank].size(); + elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); } - elem_recv_strides_array.update_device(); DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); // Fill in the data for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { @@ -1539,7 +1532,6 @@ void build_ghost( } } elems_to_recv_by_rank_rr.update_device(); - MATAR_FENCE(); element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); @@ -1555,12 +1547,11 @@ void build_ghost( // -------------------------------------------------------------------------------------- // Serialize into a DRaggedRightArrayKokkos - DCArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks,"node_send_strides_array"); + CArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks); for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); - node_send_strides_array.host(i) = nodes_to_send_by_rank[dest_rank].size(); + node_send_strides_array(i) = nodes_to_send_by_rank[dest_rank].size(); } - node_send_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_send_by_rank_rr(node_send_strides_array, "nodes_to_send_by_rank"); // Fill in the data @@ -1625,12 +1616,11 @@ void build_ghost( } // Serialize into a DRaggedRightArrayKokkos - DCArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks, "nodes_recv_strides_array"); + CArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks); for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { int source_rank = node_communication_plan.recv_rank_ids.host(i); - nodes_recv_strides_array.host(i) = nodes_to_recv_by_rank[source_rank].size(); + nodes_recv_strides_array(i) = nodes_to_recv_by_rank[source_rank].size(); } - 
nodes_recv_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_recv_by_rank_rr(nodes_recv_strides_array, "nodes_to_recv_by_rank"); // Fill in the data for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { @@ -1691,7 +1681,7 @@ void partition_mesh( int rank){ bool print_info = false; - // bool print_vtk = false; + bool print_vtk = false; int num_dim = initial_mesh.num_dims; @@ -1819,7 +1809,7 @@ void partition_mesh( for (size_t k = 0; k < naive_mesh.num_elems; k++) { int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); elem_gid_to_offset[elem_gid_on_rank] = current_offset; - current_offset += num_elems_in_elem_per_rank.host(k); + current_offset += num_elems_in_elem_per_rank(k); } // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- @@ -1846,11 +1836,11 @@ void partition_mesh( break; } } - size_t num_nbrs = num_elems_in_elem_per_rank.host(idx); + size_t num_nbrs = num_elems_in_elem_per_rank(idx); // Append each neighbor (by its GLOBAL elem GID) to edgeloctab for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank.host(elems_in_elem_offset + j); // This is a global element ID! + size_t neighbor_gid = elems_in_elem_on_rank(elems_in_elem_offset + j); // This is a global element ID! 
edgeloctab.push_back(static_cast(neighbor_gid)); ++offset; // Increment running edge count } @@ -2216,8 +2206,8 @@ void partition_mesh( // -------------- Phase 6: Build the intermediate_mesh -------------- intermediate_mesh.initialize_nodes(num_new_nodes); intermediate_mesh.initialize_elems(num_new_elems, naive_mesh.num_dims); - intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes, "intermediate_mesh.local_to_global_node_mapping"); - intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems, "intermediate_mesh.local_to_global_elem_mapping"); + intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); // Fill global mappings for (int i = 0; i < num_new_nodes; i++) @@ -2325,10 +2315,10 @@ void partition_mesh( gauss_point.fields_vec.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { - gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated - gauss_point.fields_vec.host(i, 0) = -1.0; - gauss_point.fields_vec.host(i, 1) = -1.0; - gauss_point.fields_vec.host(i, 2) = -1.0; + gauss_point.fields.host(i) = -100.0; // Ghost elements should be updated + gauss_point.fields_vec.host(i, 0) = -100.0; + gauss_point.fields_vec.host(i, 1) = -100.0; + gauss_point.fields_vec.host(i, 2) = -100.0; } gauss_point.fields.update_device(); gauss_point.fields_vec.update_device(); @@ -2337,25 +2327,32 @@ void partition_mesh( gauss_point.fields_vec.communicate(); // Loop over all elements and average the values of elements connected to that element - FOR_ALL(i, 0, final_mesh.num_elems, { + + CArrayKokkos tmp_store(final_mesh.num_elems); + FOR_ALL(i, 0, final_mesh.num_owned_elems, { double value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields(final_mesh.elems_in_elem(i, j)); + value += 
gauss_point.fields.host(final_mesh.elems_in_elem(i, j)); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields(i) = value; + tmp_store(i) = value; + // gauss_point.fields(i) = value; value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields_vec(final_mesh.elems_in_elem(i, j), 0); + value += gauss_point.fields_vec.host(final_mesh.elems_in_elem(i, j), 0); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields_vec(i, 0) = value; - gauss_point.fields_vec(i, 1) = value; - gauss_point.fields_vec(i, 2) = value; + gauss_point.fields_vec.host(i, 0) = value; + gauss_point.fields_vec.host(i, 1) = value; + gauss_point.fields_vec.host(i, 2) = value; }); - gauss_point.fields.update_host(); - gauss_point.fields_vec.update_host(); + + FOR_ALL(i, 0, final_mesh.num_owned_elems, { + gauss_point.fields(i) = tmp_store(i); + }); + + gauss_point.fields_vec.update_device(); @@ -2370,10 +2367,10 @@ void partition_mesh( final_node.vector_field.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { - final_node.scalar_field.host(i) = -1.0; - final_node.vector_field.host(i, 0) = -1.0; - final_node.vector_field.host(i, 1) = -1.0; - final_node.vector_field.host(i, 2) = -1.0; + final_node.scalar_field.host(i) = -100.0; + final_node.vector_field.host(i, 0) = -100.0; + final_node.vector_field.host(i, 1) = -100.0; + final_node.vector_field.host(i, 2) = -100.0; } final_node.coords.update_device(); @@ -2398,7 +2395,7 @@ void partition_mesh( final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = value; } }); - final_node.scalar_field.update_host(); + } #endif // DECOMP_UTILS_H \ No newline at end of file From e8602a2844274cfbdae6607b19364dfc4af7cacf Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 15:47:20 -0600 Subject: [PATCH 45/52] Revert "BUG: Chasing nodal comms bug" This reverts commit 9217adb7017d2d839eafdbb838a5dd593d85d86c. 
--- examples/mesh_decomp/decomp_utils.h | 139 ++++++++++++++-------------- 1 file changed, 71 insertions(+), 68 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index e0600761..d43aa497 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -398,7 +398,7 @@ void naive_partition_mesh( std::cout<< "Rank = "<< i < owned_coords_send(3*local_owned_count, 0.0); + std::vector owned_coords_send(num_dim*local_owned_count, 0.0); for (int i = 0; i < local_owned_count; i++) { - owned_coords_send[3*i+0] = input_node.coords.host(i,0); - owned_coords_send[3*i+1] = input_node.coords.host(i,1); - owned_coords_send[3*i+2] = input_node.coords.host(i,2); + for(int dim = 0; dim < num_dim; dim++){ + owned_coords_send[num_dim*i+dim] = input_node.coords.host(i,dim); + } } - std::vector all_owned_coords(3 * total_owned, 0.0); + std::vector all_owned_coords(num_dim * total_owned, 0.0); // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Getting coord_counts" << std::endl; + std::vector coord_counts(world_size); std::vector coord_displs(world_size); for (int r = 0; r < world_size; r++) { - coord_counts[r] = 3 * owned_counts[r]; // Each node has 3 doubles - coord_displs[r] = 3 * owned_displs[r]; // Displacement in doubles + coord_counts[r] = num_dim * owned_counts[r]; // Each node has num_dim doubles + coord_displs[r] = num_dim * owned_displs[r]; // Displacement in doubles } - MPI_Allgatherv(owned_coords_send.data(), 3*local_owned_count, MPI_DOUBLE, + MPI_Allgatherv(owned_coords_send.data(), num_dim*local_owned_count, MPI_DOUBLE, all_owned_coords.data(), coord_counts.data(), coord_displs.data(), MPI_DOUBLE, MPI_COMM_WORLD); // e) Build map: gid -> coord[3] - std::unordered_map> gid_to_coord; + std::unordered_map> gid_to_coord; for (int i = 0; i < total_owned; i++) { - std::array xyz = { - 
all_owned_coords[3*i+0], - all_owned_coords[3*i+1], - all_owned_coords[3*i+2] - }; - gid_to_coord[all_owned_gids[i]] = xyz; + std::vector xyz(num_dim); // size is runtime-dependent + for (int dim = 0; dim < num_dim; dim++) { + xyz[dim] = all_owned_coords[num_dim * i + dim]; + } + gid_to_coord[all_owned_gids[i]] = std::move(xyz); } // 4. Finally, fill output_node.coords with correct coordinates. @@ -1239,14 +1242,14 @@ void build_ghost( size_t gid = output_mesh.local_to_global_node_mapping.host(i); auto it = gid_to_coord.find(gid); if (it != gid_to_coord.end()) { - output_node.coords.host(i,0) = it->second[0]; - output_node.coords.host(i,1) = it->second[1]; - output_node.coords.host(i,2) = it->second[2]; + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = it->second[dim]; + } } else { // Could happen if there's a bug: fill with zeros for safety - output_node.coords.host(i,0) = 0.0; - output_node.coords.host(i,1) = 0.0; - output_node.coords.host(i,2) = 0.0; + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = 0.0; + } } } output_node.coords.update_device(); @@ -1314,6 +1317,7 @@ void build_ghost( } MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<"After boundary_elem_targets"< boundary_elem_local_ids; @@ -1354,7 +1358,7 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); output_mesh.num_boundary_elems = boundary_elem_local_ids.size(); - output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems); + output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems, "boundary_elem_local_ids"); for (int i = 0; i < output_mesh.num_boundary_elems; i++) { output_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; } @@ -1421,7 +1425,7 @@ void build_ghost( // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) // Could be used to specify communication volume if needed for optimization - int* sourceweights = MPI_UNWEIGHTED; + // int* 
sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1448,7 +1452,7 @@ void build_ghost( int* node_sources = (node_indegree > 0) ? ghost_node_receive_ranks_vec.data() : MPI_UNWEIGHTED; // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) - int* node_sourceweights = MPI_UNWEIGHTED; + //int* node_sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1457,11 +1461,12 @@ void build_ghost( int* node_destinations = (node_outdegree > 0) ? ghost_node_send_ranks_vec.data() : MPI_UNWEIGHTED; // destinationweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) - int* node_destinationweights = MPI_UNWEIGHTED; + // int* node_destinationweights = MPI_UNWEIGHTED; // Initialize the graph communicator for node communication node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout<<"After node graph communicator"< strides_array(element_communication_plan.num_send_ranks); + DCArrayKokkos strides_array(element_communication_plan.num_send_ranks, "strides_for_elems_to_send"); for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { int dest_rank = element_communication_plan.send_rank_ids.host(i); - strides_array(i) = elems_to_send_by_rank[dest_rank].size(); + strides_array.host(i) = elems_to_send_by_rank[dest_rank].size(); } + strides_array.update_device(); DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); // Fill in the data @@ -1517,12 +1523,13 @@ void build_ghost( } // ========== Serialize into a DRaggedRightArrayKokkos ========== - CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); + DCArrayKokkos 
elem_recv_strides_array(element_communication_plan.num_recv_ranks, "elem_recv_strides_array"); for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { int source_rank = element_communication_plan.recv_rank_ids.host(i); - elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); + elem_recv_strides_array.host(i) = elems_to_recv_by_rank[source_rank].size(); } + elem_recv_strides_array.update_device(); DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); // Fill in the data for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { @@ -1532,6 +1539,7 @@ void build_ghost( } } elems_to_recv_by_rank_rr.update_device(); + MATAR_FENCE(); element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); @@ -1547,11 +1555,12 @@ void build_ghost( // -------------------------------------------------------------------------------------- // Serialize into a DRaggedRightArrayKokkos - CArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks); + DCArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks,"node_send_strides_array"); for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); - node_send_strides_array(i) = nodes_to_send_by_rank[dest_rank].size(); + node_send_strides_array.host(i) = nodes_to_send_by_rank[dest_rank].size(); } + node_send_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_send_by_rank_rr(node_send_strides_array, "nodes_to_send_by_rank"); // Fill in the data @@ -1616,11 +1625,12 @@ void build_ghost( } // Serialize into a DRaggedRightArrayKokkos - CArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks); + DCArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks, "nodes_recv_strides_array"); for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) 
{ int source_rank = node_communication_plan.recv_rank_ids.host(i); - nodes_recv_strides_array(i) = nodes_to_recv_by_rank[source_rank].size(); + nodes_recv_strides_array.host(i) = nodes_to_recv_by_rank[source_rank].size(); } + nodes_recv_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_recv_by_rank_rr(nodes_recv_strides_array, "nodes_to_recv_by_rank"); // Fill in the data for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { @@ -1681,7 +1691,7 @@ void partition_mesh( int rank){ bool print_info = false; - bool print_vtk = false; + // bool print_vtk = false; int num_dim = initial_mesh.num_dims; @@ -1809,7 +1819,7 @@ void partition_mesh( for (size_t k = 0; k < naive_mesh.num_elems; k++) { int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); elem_gid_to_offset[elem_gid_on_rank] = current_offset; - current_offset += num_elems_in_elem_per_rank(k); + current_offset += num_elems_in_elem_per_rank.host(k); } // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- @@ -1836,11 +1846,11 @@ void partition_mesh( break; } } - size_t num_nbrs = num_elems_in_elem_per_rank(idx); + size_t num_nbrs = num_elems_in_elem_per_rank.host(idx); // Append each neighbor (by its GLOBAL elem GID) to edgeloctab for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank(elems_in_elem_offset + j); // This is a global element ID! + size_t neighbor_gid = elems_in_elem_on_rank.host(elems_in_elem_offset + j); // This is a global element ID! 
edgeloctab.push_back(static_cast(neighbor_gid)); ++offset; // Increment running edge count } @@ -2206,8 +2216,8 @@ void partition_mesh( // -------------- Phase 6: Build the intermediate_mesh -------------- intermediate_mesh.initialize_nodes(num_new_nodes); intermediate_mesh.initialize_elems(num_new_elems, naive_mesh.num_dims); - intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); - intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes, "intermediate_mesh.local_to_global_node_mapping"); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems, "intermediate_mesh.local_to_global_elem_mapping"); // Fill global mappings for (int i = 0; i < num_new_nodes; i++) @@ -2315,10 +2325,10 @@ void partition_mesh( gauss_point.fields_vec.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { - gauss_point.fields.host(i) = -100.0; // Ghost elements should be updated - gauss_point.fields_vec.host(i, 0) = -100.0; - gauss_point.fields_vec.host(i, 1) = -100.0; - gauss_point.fields_vec.host(i, 2) = -100.0; + gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated + gauss_point.fields_vec.host(i, 0) = -1.0; + gauss_point.fields_vec.host(i, 1) = -1.0; + gauss_point.fields_vec.host(i, 2) = -1.0; } gauss_point.fields.update_device(); gauss_point.fields_vec.update_device(); @@ -2327,32 +2337,25 @@ void partition_mesh( gauss_point.fields_vec.communicate(); // Loop over all elements and average the values of elements connected to that element - - CArrayKokkos tmp_store(final_mesh.num_elems); - FOR_ALL(i, 0, final_mesh.num_owned_elems, { + FOR_ALL(i, 0, final_mesh.num_elems, { double value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields.host(final_mesh.elems_in_elem(i, j)); + value += 
gauss_point.fields(final_mesh.elems_in_elem(i, j)); } value /= final_mesh.num_elems_in_elem(i); - tmp_store(i) = value; - // gauss_point.fields(i) = value; + gauss_point.fields(i) = value; value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields_vec.host(final_mesh.elems_in_elem(i, j), 0); + value += gauss_point.fields_vec(final_mesh.elems_in_elem(i, j), 0); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields_vec.host(i, 0) = value; - gauss_point.fields_vec.host(i, 1) = value; - gauss_point.fields_vec.host(i, 2) = value; + gauss_point.fields_vec(i, 0) = value; + gauss_point.fields_vec(i, 1) = value; + gauss_point.fields_vec(i, 2) = value; }); - - FOR_ALL(i, 0, final_mesh.num_owned_elems, { - gauss_point.fields(i) = tmp_store(i); - }); - - gauss_point.fields_vec.update_device(); + gauss_point.fields.update_host(); + gauss_point.fields_vec.update_host(); @@ -2367,10 +2370,10 @@ void partition_mesh( final_node.vector_field.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { - final_node.scalar_field.host(i) = -100.0; - final_node.vector_field.host(i, 0) = -100.0; - final_node.vector_field.host(i, 1) = -100.0; - final_node.vector_field.host(i, 2) = -100.0; + final_node.scalar_field.host(i) = -1.0; + final_node.vector_field.host(i, 0) = -1.0; + final_node.vector_field.host(i, 1) = -1.0; + final_node.vector_field.host(i, 2) = -1.0; } final_node.coords.update_device(); @@ -2395,7 +2398,7 @@ void partition_mesh( final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = value; } }); - + final_node.scalar_field.update_host(); } #endif // DECOMP_UTILS_H \ No newline at end of file From c7f500ad07ec334312af912f66cd9a12f822a824 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 16:06:03 -0600 Subject: [PATCH 46/52] BUG: Chasing cuda bug still --- examples/mesh_decomp/decomp_utils.h | 11 ++++++++++- examples/mesh_decomp/mesh_io.h | 4 +--- 2 files 
changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index d43aa497..118e4fc5 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -2335,6 +2335,8 @@ void partition_mesh( gauss_point.fields.communicate(); gauss_point.fields_vec.communicate(); + + CArrayKokkos tmp(final_mesh.num_elems); // Loop over all elements and average the values of elements connected to that element FOR_ALL(i, 0, final_mesh.num_elems, { @@ -2343,7 +2345,9 @@ void partition_mesh( value += gauss_point.fields(final_mesh.elems_in_elem(i, j)); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields(i) = value; + + tmp(i) = value; + value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { @@ -2354,6 +2358,11 @@ void partition_mesh( gauss_point.fields_vec(i, 1) = value; gauss_point.fields_vec(i, 2) = value; }); + + FOR_ALL(i, 0, final_mesh.num_elems, { + gauss_point.fields(i) = tmp(i); + }); + gauss_point.fields.update_host(); gauss_point.fields_vec.update_host(); diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 04c2cad1..79eec569 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -354,9 +354,6 @@ void build_3d_box( point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); point_scalar_fields(node_gid, 2) = node.scalar_field.host(node_gid); - if(node_gid == 0) { - std::cout << "*******[rank " << rank << "] - num_corners_in_node: " << mesh.num_corners_in_node(node_gid) << std::endl; - } } // end for loop over vertices @@ -588,6 +585,7 @@ void write_vtu(Mesh_t& mesh, }); MATAR_FENCE(); num_elems_in_elem.update_host(); + MATAR_FENCE(); for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { elem_fields(elem_gid, 0) = rank; From 1953d5d047239bc974c20edc4be13a82516e5a3f Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 17:14:42 -0600 Subject: [PATCH 47/52] 
BUG: Chasing cuda+MPI bug --- examples/mesh_decomp/decomp_utils.h | 62 ++++++++++++++++++++++++----- src/include/communication_plan.h | 2 + src/include/mpi_types.h | 2 +- 3 files changed, 55 insertions(+), 11 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 118e4fc5..ac88fc37 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -295,6 +295,7 @@ void naive_partition_mesh( dim, 0, num_dim,{ naive_node.coords(node_id, dim) = node_pos_on_rank(node_id, dim); }); + MATAR_FENCE(); naive_node.coords.update_host(); @@ -433,16 +434,18 @@ void naive_partition_mesh( tmp_elems_in_elem(elem_gid, i) = initial_mesh.elems_in_elem(elem_gid, i); } // end for i }); // end FOR_ALL elems + MATAR_FENCE(); tmp_elems_in_elem.update_host(); - Kokkos::fence(); + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); FOR_ALL(i, 0, initial_mesh.num_elems, { tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); }); - tmp_num_elems_in_elem.update_host(); MATAR_FENCE(); + tmp_num_elems_in_elem.update_host(); + for(int i = 0; i < world_size; i++) { sendcounts[i] = elem_elem_counts[i]; @@ -2326,16 +2329,20 @@ void partition_mesh( } for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated - gauss_point.fields_vec.host(i, 0) = -1.0; - gauss_point.fields_vec.host(i, 1) = -1.0; - gauss_point.fields_vec.host(i, 2) = -1.0; + gauss_point.fields_vec.host(i, 0) = -100.0; + gauss_point.fields_vec.host(i, 1) = -100.0; + gauss_point.fields_vec.host(i, 2) = -100.0; } gauss_point.fields.update_device(); gauss_point.fields_vec.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); gauss_point.fields.communicate(); gauss_point.fields_vec.communicate(); + MPI_Barrier(MPI_COMM_WORLD); + CArrayKokkos tmp(final_mesh.num_elems); // Loop over all elements and average the values of elements connected to that 
element @@ -2358,10 +2365,12 @@ void partition_mesh( gauss_point.fields_vec(i, 1) = value; gauss_point.fields_vec(i, 2) = value; }); + MATAR_FENCE(); FOR_ALL(i, 0, final_mesh.num_elems, { gauss_point.fields(i) = tmp(i); }); + MATAR_FENCE(); gauss_point.fields.update_host(); gauss_point.fields_vec.update_host(); @@ -2371,6 +2380,19 @@ void partition_mesh( // Test node communication using MPI_Neighbor_alltoallv std::vector node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); + + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << final_mesh.num_owned_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << final_mesh.num_elems - final_mesh.num_owned_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << final_mesh.num_owned_nodes << std::endl; + std::cout << "[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_nodes - final_mesh.num_owned_nodes << std::endl; + std::cout << std::flush; + } + MPI_Barrier(MPI_COMM_WORLD); + } for (int i = 0; i < final_mesh.num_owned_nodes; i++) { final_node.scalar_field.host(i) = static_cast(rank); @@ -2379,15 +2401,19 @@ void partition_mesh( final_node.vector_field.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { - final_node.scalar_field.host(i) = -1.0; - final_node.vector_field.host(i, 0) = -1.0; - final_node.vector_field.host(i, 1) = -1.0; - final_node.vector_field.host(i, 2) = -1.0; + final_node.scalar_field.host(i) = -100.0; + final_node.vector_field.host(i, 0) = -100.0; + final_node.vector_field.host(i, 1) = -100.0; + final_node.vector_field.host(i, 2) = -100.0; } final_node.coords.update_device(); 
final_node.scalar_field.update_device(); final_node.vector_field.update_device(); + MATAR_FENCE(); + MPI_Barrier(MPI_COMM_WORLD); + + node_communication_plan.verify_graph_communicator(); final_node.scalar_field.communicate(); // final_node.vector_field.communicate(); @@ -2395,6 +2421,8 @@ void partition_mesh( // Update scalar field to visualize the communication + + CArrayKokkos tmp_too(final_mesh.num_elems); FOR_ALL(i, 0, final_mesh.num_elems, { double value = 0.0; @@ -2402,12 +2430,26 @@ void partition_mesh( value += final_node.scalar_field(final_mesh.nodes_in_elem(i, j)); } value /= final_mesh.num_nodes_in_elem; + tmp_too(i) = value; + }); + MATAR_FENCE(); + FOR_ALL(i, 0, final_mesh.num_elems, { for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = value; + final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = tmp_too(i); } }); + MATAR_FENCE(); + + MPI_Barrier(MPI_COMM_WORLD); + + if(rank == 0)std::cout<<"Print from rank 0"<send_rank_ids.host(i) = send_rank_ids[i]; } this->send_rank_ids.update_device(); + MATAR_FENCE(); // Copy and store receive neighbor IDs (in-bound neighbors: where we will receive data from) this->recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); @@ -197,6 +198,7 @@ enum class communication_plan_type { this->recv_rank_ids.host(i) = recv_rank_ids[i]; } this->recv_rank_ids.update_device(); + MATAR_FENCE(); // Create the distributed graph communicator. // This call links this process to its explicit send and receive neighbors. 
diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index b0999049..4546fd48 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -303,7 +303,7 @@ class MPICArrayKokkos { void communicate(){ this_array_.update_host(); - + MATAR_FENCE(); fill_send_buffer(); MPI_Neighbor_alltoallv( From 1a3e45a427647e93d8a4c2382601ae5b2b8c69ca Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 25 Nov 2025 13:24:11 -0600 Subject: [PATCH 48/52] ENH: CUDA builds working --- examples/mesh_decomp/decomp_utils.h | 97 ++++++++++++---------------- examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mesh_io.h | 2 - src/include/mpi_types.h | 3 +- 4 files changed, 44 insertions(+), 60 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index ac88fc37..24c75d46 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -2261,31 +2261,31 @@ void partition_mesh( // Fill node coordinates // coord_recvbuf contains coords in element-node order, but we need them in node order // Build a map from node GID to coordinates - std::map> node_gid_to_coords; + std::map> node_gid_to_coords; int coord_idx = 0; - for (int e = 0; e < intermediate_mesh.num_elems; ++e) { + for (int e = 0; e < intermediate_mesh.num_elems; e++) { for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { int node_gid = conn_recvbuf[e * intermediate_mesh.num_nodes_in_elem + j]; if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { - node_gid_to_coords[node_gid] = { - coord_recvbuf[coord_idx*3 + 0], - coord_recvbuf[coord_idx*3 + 1], - coord_recvbuf[coord_idx*3 + 2] - }; + std::vector coords(num_dim); + for (int d = 0; d < num_dim; d++) { + coords[d] = coord_recvbuf[coord_idx * num_dim + d]; + } + node_gid_to_coords[node_gid] = coords; } coord_idx++; } } // Now fill coordinates in node order - intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); + 
intermediate_node.initialize(num_new_nodes, num_dim, {node_state::coords}); for (int i = 0; i < num_new_nodes; i++) { int node_gid = new_node_gids[i]; auto it = node_gid_to_coords.find(node_gid); if (it != node_gid_to_coords.end()) { - intermediate_node.coords.host(i, 0) = it->second[0]; - intermediate_node.coords.host(i, 1) = it->second[1]; - intermediate_node.coords.host(i, 2) = it->second[2]; + for (int d = 0; d < num_dim; d++) { + intermediate_node.coords.host(i, d) = it->second[d]; + } } } intermediate_node.coords.update_device(); @@ -2361,6 +2361,7 @@ void partition_mesh( value += gauss_point.fields_vec(final_mesh.elems_in_elem(i, j), 0); } value /= final_mesh.num_elems_in_elem(i); + gauss_point.fields_vec(i, 0) = value; gauss_point.fields_vec(i, 1) = value; gauss_point.fields_vec(i, 2) = value; @@ -2380,31 +2381,18 @@ void partition_mesh( // Test node communication using MPI_Neighbor_alltoallv std::vector node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); - - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; - std::cout << "[rank " << rank << "] - Owned elements: " << final_mesh.num_owned_elems << std::endl; - std::cout << "[rank " << rank << "] - Ghost elements: " << final_mesh.num_elems - final_mesh.num_owned_elems << std::endl; - std::cout << "[rank " << rank << "] - Owned nodes: " << final_mesh.num_owned_nodes << std::endl; - std::cout << "[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_nodes - final_mesh.num_owned_nodes << std::endl; - std::cout << std::flush; - } - MPI_Barrier(MPI_COMM_WORLD); - } for (int i = 0; i < final_mesh.num_owned_nodes; i++) { final_node.scalar_field.host(i) = static_cast(rank); - final_node.vector_field.host(i, 0) = static_cast(rank); - 
final_node.vector_field.host(i, 1) = static_cast(rank); - final_node.vector_field.host(i, 2) = static_cast(rank); + for(int dim = 0; dim < num_dim; dim++){ + final_node.vector_field.host(i, dim) = static_cast(rank); + } } for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { final_node.scalar_field.host(i) = -100.0; - final_node.vector_field.host(i, 0) = -100.0; - final_node.vector_field.host(i, 1) = -100.0; - final_node.vector_field.host(i, 2) = -100.0; + for(int dim = 0; dim < num_dim; dim++){ + final_node.vector_field.host(i, dim) = -100; + } } final_node.coords.update_device(); @@ -2416,38 +2404,35 @@ void partition_mesh( node_communication_plan.verify_graph_communicator(); final_node.scalar_field.communicate(); - // final_node.vector_field.communicate(); - MPI_Barrier(MPI_COMM_WORLD); - - - // Update scalar field to visualize the communication - - CArrayKokkos tmp_too(final_mesh.num_elems); - FOR_ALL(i, 0, final_mesh.num_elems, { - - double value = 0.0; - for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - value += final_node.scalar_field(final_mesh.nodes_in_elem(i, j)); - } - value /= final_mesh.num_nodes_in_elem; - tmp_too(i) = value; - }); + final_node.vector_field.communicate(); + MATAR_FENCE(); + MPI_Barrier(MPI_COMM_WORLD); - FOR_ALL(i, 0, final_mesh.num_elems, { - for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = tmp_too(i); - } - }); - MATAR_FENCE(); + DCArrayKokkos tmp_too(final_mesh.num_nodes); + for(int smooth = 0; smooth < 3; smooth++){ + FOR_ALL(i, 0, final_mesh.num_nodes, { - MPI_Barrier(MPI_COMM_WORLD); + double value = final_node.scalar_field(i); + for(int j = 0; j < final_mesh.num_nodes_in_node(i); j++){ + value += final_node.scalar_field(final_mesh.nodes_in_node(i, j)); + } + value /= final_mesh.num_nodes_in_node(i) + 1; + tmp_too(i) = value; + }); + MATAR_FENCE(); - if(rank == 0)std::cout<<"Print from rank 0"<::value(), // MPI_TYPE 
comm_plan_->mpi_comm_graph); - + MATAR_FENCE(); copy_recv_buffer(); + MATAR_FENCE(); this_array_.update_device(); }; From 614bf4cd0d7fe6b5874a1ab466009718fc582a6f Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 25 Nov 2025 14:49:48 -0600 Subject: [PATCH 49/52] ENH: Tidying up --- src/include/communication_plan.h | 23 +++------ src/include/mpi_types.h | 86 ++++++++++++++++++++++++-------- 2 files changed, 71 insertions(+), 38 deletions(-) diff --git a/src/include/communication_plan.h b/src/include/communication_plan.h index d0140d43..f6613c10 100644 --- a/src/include/communication_plan.h +++ b/src/include/communication_plan.h @@ -9,25 +9,14 @@ using namespace mtr; -/** - * @struct CommunicationPlan - * @brief Manages efficient MPI communication for ghost element and node data exchange - * - * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. - * Designed to be embedded in distributed data structures for automatic ghost synchronization. - * - * Usage pattern in distributed structures: - * node.velocity.comm() -> automatically syncs ghost nodes - * elem.density.comm() -> automatically syncs ghost elements - * - */ + enum class communication_plan_type { no_communication, all_to_all_graph }; - struct CommunicationPlan { +struct CommunicationPlan { // ======================================================================== // Metadata for MPI neighbor graph communication @@ -220,6 +209,7 @@ enum class communication_plan_type { has_comm_graph = true; } + // Useful function for debugging, possibly remove void verify_graph_communicator(){ if(!has_comm_graph){ throw std::runtime_error("MPI graph communicator has not been initialized"); @@ -320,6 +310,7 @@ enum class communication_plan_type { MPI_Barrier(mpi_comm_world); } + // Setup send/receive metadata void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ this->send_indices_ = rank_send_ids; // indices of element data to send to 
each rank @@ -360,10 +351,10 @@ enum class communication_plan_type { } } this->recv_displs_.update_device(); - - MPI_Barrier(mpi_comm_world); + MATAR_FENCE(); } + // Useful function for debugging, possibly remove void verify_send_recv(){ if(!has_comm_graph){ @@ -511,8 +502,6 @@ enum class communication_plan_type { throw std::runtime_error("Send/Recv communication plan verification failed"); } } - - }; // End of CommunicationPlan #endif // end if HAVE_MPI diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index aa744678..5f83265b 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -83,9 +83,9 @@ class MPICArrayKokkos { DCArrayKokkos recv_buffer_; protected: - size_t dims_[7]; - size_t length_; - size_t order_; // tensor order (rank) + size_t dims_[7] = {0,0,0,0,0,0,0}; + size_t length_ = 0; + size_t order_ = 0; // tensor order (rank) MPI_Comm mpi_comm_; MPI_Status mpi_status_; @@ -94,7 +94,7 @@ class MPICArrayKokkos { // --- Ghost Communication Support --- - CommunicationPlan* comm_plan_; // Pointer to shared communication plan + CommunicationPlan* comm_plan_ = NULL; // Pointer to shared communication plan DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank @@ -113,7 +113,7 @@ class MPICArrayKokkos { size_t num_ghost_; // Number of ghost items (nodes/elements) public: - // Data member to access host view + // Data member to access host view (initialized as pointer to this_array_.host_pointer()) ViewCArray host; @@ -143,8 +143,6 @@ class MPICArrayKokkos { size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); - - KOKKOS_INLINE_FUNCTION T& operator()(size_t i) const; @@ -254,6 +252,10 @@ class MPICArrayKokkos { // Such that all the boundary elements going to a given rank are contiguous in the send buffer. 
void fill_send_buffer(){ + // Copy this_array_ to the host + this_array_.update_host(); + MATAR_FENCE(); + size_t send_idx = 0; for(int i = 0; i < comm_plan_->num_send_ranks; i++){ for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ @@ -284,7 +286,6 @@ class MPICArrayKokkos { recv_idx += stride_; } } - this_array_.update_device(); }; @@ -300,10 +301,25 @@ class MPICArrayKokkos { // Method that communicates the data between the ranks // NOTE: This is a blocking communication operation, // if you want to use non-blocking communication, you can use the following: MPI_Ineighbor_alltoallv + + // TODO: Replace this with persistent communicator: + // MPI_Request req; + + // // Create persistent operation ONCE + // MPI_Neighbor_alltoallv_init( + // sendbuf, sendcounts, sdispls, mpi_type_map::value(), + // recvbuf, recvcounts, rdispls, mpi_type_map::value(), + // comm_plan_->mpi_comm_graph, + // MPI_INFO_NULL, + // &req); + + // // Then inside time step loop: + // MPI_Start(&req); + // // modify sendbuf in-place as needed + // MPI_Wait(&req); + void communicate(){ - this_array_.update_host(); - MATAR_FENCE(); fill_send_buffer(); MPI_Neighbor_alltoallv( @@ -316,11 +332,10 @@ class MPICArrayKokkos { recv_displs_.host_pointer(), mpi_type_map::value(), // MPI_TYPE comm_plan_->mpi_comm_graph); - MATAR_FENCE(); + copy_recv_buffer(); - MATAR_FENCE(); - this_array_.update_device(); + MATAR_FENCE(); }; void set_values(const T& value){ @@ -339,7 +354,7 @@ MPICArrayKokkos::MPICArrayKokkos() for (int i = 0; i < 7; i++) { dims_[i] = 0; } - } +} // Overloaded 1D constructor template @@ -507,12 +522,41 @@ T& MPICArrayKokkos::operator()(size_t i, size_t template KOKKOS_INLINE_FUNCTION MPICArrayKokkos& MPICArrayKokkos::operator=(const MPICArrayKokkos& temp) { - this_array_ = temp.this_array_; - host = temp.host; // Also copy the host ViewCArray - comm_plan_ = temp.comm_plan_; - send_buffer_ = temp.send_buffer_; - recv_buffer_ = temp.recv_buffer_; - stride_ = temp.stride_; + + // Do 
nothing if the assignment is of the form x = x + if (this != &temp) { + + this_array_ = temp.this_array_; + send_buffer_ = temp.send_buffer_; + recv_buffer_ = temp.recv_buffer_; + + length_ = temp.length_; + + for (int iter = 0; iter < temp.order_; iter++){ + dims_[iter] = temp.dims_[iter]; + } // end for + + order_ = temp.order_; + + mpi_status_ = temp.mpi_status_; + mpi_datatype_ = temp.mpi_datatype_; + mpi_request_ = temp.mpi_request_; + comm_plan_ = temp.comm_plan_; + + send_counts_ = temp.send_counts_; + recv_counts_ = temp.recv_counts_; + send_displs_ = temp.send_displs_; + recv_displs_ = temp.recv_displs_; + stride_ = temp.stride_; + + send_indices_ = temp.send_indices_; + recv_indices_ = temp.recv_indices_; + + num_owned_ = temp.num_owned_; + num_ghost_ = temp.num_ghost_; + + host = temp.host; // Also copy the host ViewCArray + } return *this; } @@ -533,7 +577,7 @@ template ::dims(size_t i) const { assert(i < order_ && "MPICArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); - assert(dims_[i]>0 && "Access to MPICArrayKokkos dims is out of bounds!"); + assert(dims_[i] > 0 && "Access to MPICArrayKokkos dims is out of bounds!"); return this_array_.dims(i); } From 6156a6bad82a527c7dc7af32962a0af4015f216f Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 25 Nov 2025 15:08:58 -0600 Subject: [PATCH 50/52] BUG: Correct default build script behavior --- scripts/build-matar.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build-matar.sh b/scripts/build-matar.sh index fa95bc12..30f384dc 100755 --- a/scripts/build-matar.sh +++ b/scripts/build-matar.sh @@ -71,7 +71,7 @@ show_help() { build_action="full-app" execution="examples" machine="linux" -kokkos_build_type="openmp" +kokkos_build_type="serial" build_cores="1" trilinos="disabled" intel_mkl="disabled" From 7dee53afc22395b4e012b50b0d4caa2594e6af26 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 25 Nov 2025 15:17:50 -0600 Subject: [PATCH 51/52] ENH: 
Update minimum cmake version --- benchmark/CMakeLists.txt | 2 +- examples/gArrayofgArrays/CMakeLists.txt | 2 +- examples/halfspace_cooling/CMakeLists.txt | 2 +- examples/laplace/CMakeLists.txt | 2 +- examples/laplaceMPI/CMakeLists.txt | 2 +- examples/mesh_decomp/CMakeLists.txt | 2 +- examples/phaseField/srcKokkosVerbose/CMakeLists.txt | 2 +- examples/phaseField/srcMacros/CMakeLists.txt | 2 +- examples/phaseFieldMPI/CMakeLists.txt | 2 +- examples/sparsetests/CMakeLists.txt | 2 +- examples/test_rocm/CMakeLists.txt | 2 +- examples/virtualFcnKokkos/CMakeLists.txt | 2 +- examples/virtualFcnMATAR/CMakeLists.txt | 2 +- examples/watt-graph/CMakeLists.txt | 2 +- test/test_cases/CMakeLists.txt | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 372ad21c..0a548973 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) project (matarbenchmark) diff --git a/examples/gArrayofgArrays/CMakeLists.txt b/examples/gArrayofgArrays/CMakeLists.txt index 33a5fa97..e90dd1da 100644 --- a/examples/gArrayofgArrays/CMakeLists.txt +++ b/examples/gArrayofgArrays/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git a/examples/halfspace_cooling/CMakeLists.txt b/examples/halfspace_cooling/CMakeLists.txt index dbcaa6f9..91bffb75 100644 --- a/examples/halfspace_cooling/CMakeLists.txt +++ b/examples/halfspace_cooling/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git a/examples/laplace/CMakeLists.txt b/examples/laplace/CMakeLists.txt index acbd4a1f..b3122cd0 100644 --- a/examples/laplace/CMakeLists.txt +++ b/examples/laplace/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) 
find_package(Matar REQUIRED) diff --git a/examples/laplaceMPI/CMakeLists.txt b/examples/laplaceMPI/CMakeLists.txt index 5b114927..d722fac9 100644 --- a/examples/laplaceMPI/CMakeLists.txt +++ b/examples/laplaceMPI/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) if (KOKKOS) #find_package(Kokkos REQUIRED) #new diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt index b5ea83ca..6c8901da 100644 --- a/examples/mesh_decomp/CMakeLists.txt +++ b/examples/mesh_decomp/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) # Find MPI find_package(MPI REQUIRED) diff --git a/examples/phaseField/srcKokkosVerbose/CMakeLists.txt b/examples/phaseField/srcKokkosVerbose/CMakeLists.txt index 0da1896c..4f473fd7 100644 --- a/examples/phaseField/srcKokkosVerbose/CMakeLists.txt +++ b/examples/phaseField/srcKokkosVerbose/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) if (KOKKOS) diff --git a/examples/phaseField/srcMacros/CMakeLists.txt b/examples/phaseField/srcMacros/CMakeLists.txt index 0da1896c..4f473fd7 100644 --- a/examples/phaseField/srcMacros/CMakeLists.txt +++ b/examples/phaseField/srcMacros/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) if (KOKKOS) diff --git a/examples/phaseFieldMPI/CMakeLists.txt b/examples/phaseFieldMPI/CMakeLists.txt index 3650430a..4b8c6961 100644 --- a/examples/phaseFieldMPI/CMakeLists.txt +++ b/examples/phaseFieldMPI/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) #project (phasefield_mpi) diff --git a/examples/sparsetests/CMakeLists.txt b/examples/sparsetests/CMakeLists.txt index b8e3164d..a0f4c506 100644 --- a/examples/sparsetests/CMakeLists.txt +++ b/examples/sparsetests/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 
3.1.3) +cmake_minimum_required(VERSION 3.5) if (KOKKOS) #find_package(Kokkos REQUIRED) diff --git a/examples/test_rocm/CMakeLists.txt b/examples/test_rocm/CMakeLists.txt index 31c4c2e2..564bb7e3 100644 --- a/examples/test_rocm/CMakeLists.txt +++ b/examples/test_rocm/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) #project (test_rocm) diff --git a/examples/virtualFcnKokkos/CMakeLists.txt b/examples/virtualFcnKokkos/CMakeLists.txt index b0673270..89f72fab 100644 --- a/examples/virtualFcnKokkos/CMakeLists.txt +++ b/examples/virtualFcnKokkos/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) #project (virttestkokkos) diff --git a/examples/virtualFcnMATAR/CMakeLists.txt b/examples/virtualFcnMATAR/CMakeLists.txt index 4e232051..22873a82 100644 --- a/examples/virtualFcnMATAR/CMakeLists.txt +++ b/examples/virtualFcnMATAR/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git a/examples/watt-graph/CMakeLists.txt b/examples/watt-graph/CMakeLists.txt index 9db93716..3061157a 100644 --- a/examples/watt-graph/CMakeLists.txt +++ b/examples/watt-graph/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) if (NOT KOKKOS) diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt index 01cc23c0..a0e07edd 100644 --- a/test/test_cases/CMakeLists.txt +++ b/test/test_cases/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) # Find all test files in the current directory except test_main.cpp file(GLOB TEST_SOURCES "test_*.cpp") From e643d947dea723dd6be205e51c117c9c33ba57c1 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 25 Nov 2025 15:21:36 -0600 Subject: [PATCH 52/52] ENH: Missing cmake update --- test/CMakeLists.txt | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8f7fa4c2..e6c2bfaf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) project (matartest)