diff --git a/.gitignore b/.gitignore index fbdfa9d3..87400105 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ heffte/ docs_doxygen/ docs_sphinx/ tutorial/getting_started/Example0/build_* -tutorial/getting_started/Example0/install* \ No newline at end of file +tutorial/getting_started/Example0/install* +examples/mesh_decomp/lib/* \ No newline at end of file diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 372ad21c..0a548973 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) project (matarbenchmark) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index affcd031..4c379334 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -156,6 +156,9 @@ if (KOKKOS) if (MPI) include_directories(laplaceMPI) add_subdirectory(laplaceMPI) + + include_directories(mesh_decomp) + add_subdirectory(mesh_decomp) endif() endif() @@ -191,11 +194,12 @@ add_subdirectory(sparsetests) include_directories(test_rocm) add_subdirectory(test_rocm) -#include_directories(phaseField/srcKokkosVerbose) -#add_subdirectory(phaseField/srcKokkosVerbose) -#include_directories(phaseField/srcMacros) -#add_subdirectory(phaseField/srcMacros) +# include_directories(phaseField/srcKokkosVerbose) +# add_subdirectory(phaseField/srcKokkosVerbose) + +# include_directories(phaseField/srcMacros) +# add_subdirectory(phaseField/srcMacros) -#include_directories(phaseFieldMPI) -#add_subdirectory(phaseFieldMPI) +# include_directories(phaseFieldMPI) +# add_subdirectory(phaseFieldMPI) diff --git a/examples/gArrayofgArrays/CMakeLists.txt b/examples/gArrayofgArrays/CMakeLists.txt index 33a5fa97..e90dd1da 100644 --- a/examples/gArrayofgArrays/CMakeLists.txt +++ b/examples/gArrayofgArrays/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git 
a/examples/halfspace_cooling/CMakeLists.txt b/examples/halfspace_cooling/CMakeLists.txt index dbcaa6f9..91bffb75 100644 --- a/examples/halfspace_cooling/CMakeLists.txt +++ b/examples/halfspace_cooling/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git a/examples/laplace/CMakeLists.txt b/examples/laplace/CMakeLists.txt index acbd4a1f..b3122cd0 100644 --- a/examples/laplace/CMakeLists.txt +++ b/examples/laplace/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git a/examples/laplaceMPI/CMakeLists.txt b/examples/laplaceMPI/CMakeLists.txt index 5b114927..d722fac9 100644 --- a/examples/laplaceMPI/CMakeLists.txt +++ b/examples/laplaceMPI/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) if (KOKKOS) #find_package(Kokkos REQUIRED) #new diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt new file mode 100644 index 00000000..6c8901da --- /dev/null +++ b/examples/mesh_decomp/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.5) + +# Find MPI +find_package(MPI REQUIRED) +add_definitions(-DHAVE_MPI=1) + +find_package(Matar REQUIRED) + +execute_process( + COMMAND ${CMAKE_CURRENT_LIST_DIR}/install_ptscotch.sh + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + RESULT_VARIABLE INSTALL_PTSCOTCH_RESULT +) + +if(NOT INSTALL_PTSCOTCH_RESULT EQUAL 0) + message(FATAL_ERROR "Failed to install PT-Scotch by running install_ptscotch.sh") +endif() + + +if (KOKKOS) + #find_package(Kokkos REQUIRED) #new + + add_executable(mesh_decomp mesh_decomp.cpp) + + add_definitions(-DHAVE_KOKKOS=1) + + # Add include directories for MPI and Scotch/PT-Scotch + target_include_directories(mesh_decomp PRIVATE ${MPI_CXX_INCLUDE_PATH} ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/src/include) + + # Link libraries - order 
matters! libptscotch depends on libscotch + # Use -Wl,--whole-archive to ensure all symbols are included from static libraries + # Note: Only link libptscotcherr.a (not libscotcherr.a) to avoid multiple definitions + target_link_libraries(mesh_decomp ${LINKING_LIBRARIES} MPI::MPI_CXX + -Wl,--whole-archive + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libscotch.a + -Wl,--no-whole-archive + -Wl,--whole-archive + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libptscotcherr.a + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libptscotch.a + -Wl,--no-whole-archive + -lz # zlib for gzip compression + -lbz2 # bzip2 library + -llzma # xz compression library + ) +endif() diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h new file mode 100644 index 00000000..24c75d46 --- /dev/null +++ b/examples/mesh_decomp/decomp_utils.h @@ -0,0 +1,2440 @@ +#ifndef DECOMP_UTILS_H +#define DECOMP_UTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "mesh.h" +#include "state.h" +#include "mesh_io.h" +#include "communication_plan.h" + + +// Include Scotch headers +#include "scotch.h" +#include "ptscotch.h" + +/** + * @brief Partitions the input mesh into a naive element-based decomposition across MPI ranks. + * + * This function splits the input mesh (and its associated node information) evenly among the given number of MPI ranks. + * It assigns contiguous blocks of elements (and the corresponding nodes and nodal data) to each rank. + * + * The function constructs: + * - The sub-mesh (naive_mesh) and its nodes (naive_node) for the local rank. + * - Maps and vectors indicating elements and nodes present on each rank. + * - Auxiliary arrays (elems_in_elem_on_rank, num_elems_in_elem_per_rank) for local element connectivity and neighbor look-ups. + * + * The decomposition is "naive" in that it uses a simple contiguous block assignment, without regard to mesh topology or quality of partitioning. 
+ * This function is generally used as the preliminary step before repartitioning with tools like PT-Scotch or for algorithm prototyping. + * + * @param initial_mesh[in] The input mesh containing all elements/nodes on rank 0. + * @param initial_node[in] The nodal data for the input mesh on rank 0. + * @param naive_mesh[out] The mesh on this rank after naive partitioning. + * @param naive_node[out] The nodal data on this rank after naive partitioning. + * @param elems_in_elem_on_rank[out] Vector of element-to-element connectivity for this rank's local mesh. + * @param num_elems_in_elem_per_rank[out] Vector of counts for element neighbors for each local element. + * @param world_size[in] Number of MPI ranks (world size). + * @param rank[in] This MPI rank's id. + */ + +void naive_partition_mesh( + Mesh_t& initial_mesh, + node_t& initial_node, + Mesh_t& naive_mesh, + node_t& naive_node, + CArrayDual& elems_in_elem_on_rank, + CArrayDual& num_elems_in_elem_per_rank, + int world_size, + int rank) +{ + + bool print_info = false; + + int num_elements_on_rank = 0; + int num_nodes_on_rank = 0; + int num_nodes_per_elem = 0; + int num_dim = initial_mesh.num_dims; + + + // Compute the number of elements to send to each rank and num_nodes_per_elem + std::vector elems_per_rank(world_size); // number of elements to send to each rank size(world_size) + if (rank == 0) { + + num_nodes_per_elem = initial_mesh.num_nodes_in_elem; + + // Compute elements to send to each rank; handle remainders for non-even distribution + std::fill(elems_per_rank.begin(), elems_per_rank.end(), initial_mesh.num_elems / world_size); + int remainder = initial_mesh.num_elems % world_size; + for (int i = 0; i < remainder; i++) { + elems_per_rank[i] += 1; + } + } + + // Broadcasts the value of num_nodes_per_elem from the root rank (0) to all other ranks in MPI_COMM_WORLD. + // After this call, all ranks will have the same value for num_nodes_per_elem. 
+ MPI_Bcast(&num_nodes_per_elem, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // ******************************************************** + // Scatter the number of elements to each rank + // ******************************************************** + // All ranks participate in the scatter operation + // MPI_Scatter signature: + // MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + // void *recvbuf, int recvcount, MPI_Datatype recvtype, + // int root, MPI_Comm comm) + + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Scatter(elems_per_rank.data(), 1, MPI_INT, + &num_elements_on_rank, 1, MPI_INT, + 0, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + + // Vector of element to send to each rank using a naive partitioning (0-m, m-n, n-o, etc.) + std::vector elements_on_rank(num_elements_on_rank); + + + // ******************************************************** + // Scatter the actual element global ids to each rank + // ******************************************************** + + // create a 2D vector of elements to send to each rank + std::vector> elements_to_send(world_size); + if (rank == 0) { + + // Populate the elements_to_send array by finding all elements in the elements_per_rank array and adding them to the elements_to_send array + int elem_gid = 0; + for (int rank = 0; rank < world_size; rank++) { + for (int j = 0; j < elems_per_rank[rank]; j++) { + elements_to_send[rank].push_back(elem_gid); + elem_gid++; + } + } + + // Prepare data for MPI_Scatterv (scatter with variable counts) + // Flatten the 2D elements_to_send into a 1D array + std::vector all_elements; // array of all elements to be sent to each rank + std::vector sendcounts(world_size); // array of the number of elements to send to each rank + std::vector displs(world_size); // array of the displacement for each rank in the flattened array + + int displacement = 0; // displacement is the starting index of the elements for the current rank in the flattened array + 
for (int i = 0; i < world_size; i++) { + sendcounts[i] = elems_per_rank[i]; // number of elements to send to each rank + displs[i] = displacement; // displacement for each rank in the flattened array + // Copy elements for rank i to the flattened array + for (int j = 0; j < elems_per_rank[i]; j++) { + all_elements.push_back(elements_to_send[i][j]); // add the elements to the flattened array + } + displacement += elems_per_rank[i]; // increment the displacement by the number of elements to send to the next rank + } + + // Send the elements to each rank + // all_elements.data(): Pointer to the flattened array of all elements to be sent to each rank + // sendcounts.data(): Array with the number of elements to send to each rank + // displs.data(): Array with the displacement for each rank in the flattened array + // MPI_INT: Data type of the elements (integer) + // elements_on_rank.data(): Pointer to the buffer where each rank will receive its elements + // num_elements_on_rank: Number of elements that the receiving rank expects to receive + // MPI_INT: Data type of the receive buffer (integer) + // 0: The root rank (rank 0) that is performing the scatter + // MPI_COMM_WORLD: The communicator + MPI_Scatterv(all_elements.data(), sendcounts.data(), displs.data(), MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + // If the rank is not the root rank, it will receive nullptr for the sendbuf, sendcounts, and displs arrays + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + // Wait for all ranks to complete the scatter operation + MPI_Barrier(MPI_COMM_WORLD); + + // ****************************************************************************************** + // Scatter the number of nodes to each rank and compute which nodes to send to each rank + // 
****************************************************************************************** + std::vector nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) + std::vector nodes_on_rank; // node gids the current rank + std::vector> nodes_to_send(world_size); // nodes to send to each rank + + if (rank == 0) { + + // Populate the nodes_to_send array by finding all nodes in the elements in elements_to_send and removing duplicates + for (int i = 0; i < world_size; i++) { + std::set nodes_set; + for (int j = 0; j < elems_per_rank[i]; j++) { + for (int k = 0; k < num_nodes_per_elem; k++) { + nodes_set.insert(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + } + } + nodes_to_send[i] = std::vector(nodes_set.begin(), nodes_set.end()); + } + + for (int i = 0; i < world_size; i++) { + nodes_per_rank[i] = nodes_to_send[i].size(); + } + } + + // Send the number of nodes to each rank using MPI_scatter + MPI_Scatter(nodes_per_rank.data(), 1, MPI_INT, &num_nodes_on_rank, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // resize the nodes_on_rank vector to hold the received data + nodes_on_rank.resize(num_nodes_on_rank); + + MPI_Barrier(MPI_COMM_WORLD); + + // ****************************************************************************************** + // Scatter the actual node global ids to each rank + // ****************************************************************************************** + if (rank == 0) { + + // Prepare data for MPI_Scatterv (scatter with variable counts) + // Flatten the 2D nodes_to_send into a 1D array + std::vector all_nodes; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size(); + displs[i] = displacement; + // Copy nodes for rank i to the flattened array + for (int j = 0; j < nodes_to_send[i].size(); j++) { + all_nodes.push_back(nodes_to_send[i][j]); + } + 
displacement += nodes_to_send[i].size(); + } + // Send the nodes to each rank + // all_nodes.data(): Pointer to the flattened array of all nodes to be sent to each rank + // sendcounts.data(): Array with the number of nodes to send to each rank + // displs.data(): Array with the displacement for each rank in the flattened array + // MPI_INT: Data type of the nodes (integer) + // nodes_on_rank.data(): Pointer to the buffer where each rank will receive its nodes + // num_nodes_on_rank: Number of nodes that the receiving rank expects to receive + // MPI_INT: Data type of the receive buffer (integer) + // 0: The root rank (rank 0) that is performing the scatter + // MPI_COMM_WORLD: The communicator + MPI_Scatterv(all_nodes.data(), sendcounts.data(), displs.data(), MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + // ****************************************************************************************** + // Scatter the node positions to each rank + // ****************************************************************************************** + // Create a flat 1D vector for node positions (num_dim coordinates per node) + std::vector node_pos_on_rank_flat(num_nodes_on_rank * num_dim); + CArrayDual node_pos_on_rank(num_nodes_on_rank, num_dim, "node_pos_on_rank_decomp"); + + if(rank == 0){ + + // Prepare data for MPI_Scatterv (scatter with variable counts) + // Flatten the 2D node_pos_to_send into a 1D array + std::vector all_node_pos; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size() * num_dim; + displs[i] = displacement; // displacement is the starting index of the nodes for the current rank in the flattened array + // Copy node 
positions for rank i to the flattened array + for(int node_gid = 0; node_gid < nodes_to_send[i].size(); node_gid++) { + for(int dim = 0; dim < num_dim; dim++) { + all_node_pos.push_back(initial_node.coords.host(nodes_to_send[i][node_gid], dim)); + } + } + displacement += nodes_to_send[i].size() * num_dim; + } + + // Send the node positions to each rank + MPI_Scatterv(all_node_pos.data(), sendcounts.data(), displs.data(), MPI_DOUBLE, + node_pos_on_rank.host_pointer(), num_nodes_on_rank * num_dim, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_DOUBLE, + node_pos_on_rank.host_pointer(), num_nodes_on_rank * num_dim, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + node_pos_on_rank.update_device(); + + // ****************************************************************************************** + // Initialize the node state variables + // ****************************************************************************************** + + // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + naive_node.initialize(num_nodes_on_rank, num_dim, required_node_state); + + FOR_ALL(node_id, 0, num_nodes_on_rank, + dim, 0, num_dim,{ + naive_node.coords(node_id, dim) = node_pos_on_rank(node_id, dim); + }); + MATAR_FENCE(); + + naive_node.coords.update_host(); + + // ****************************************************************************************** + // Send the element-node connectivity data from the initial mesh to each rank + // ****************************************************************************************** + + // Send the element-node connectivity data from the initial mesh to each rank + std::vector nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); + + MPI_Barrier(MPI_COMM_WORLD); + + + // Instead of staging a full copy of the connectivity data 
per-rank, compute the + // scatter counts/displacements directly from the contiguous global array. + std::vector conn_sendcounts(world_size); + std::vector conn_displs(world_size); + int conn_displacement = 0; + for (int i = 0; i < world_size; i++) { + conn_sendcounts[i] = elems_per_rank[i] * num_nodes_per_elem; + conn_displs[i] = conn_displacement; + conn_displacement += conn_sendcounts[i]; + } + + // Scatter using the native storage type (size_t) and then convert locally to int + size_t* global_nodes_in_elem = nullptr; + if (rank == 0) { + global_nodes_in_elem = initial_mesh.nodes_in_elem.host_pointer(); + } + MPI_Barrier(MPI_COMM_WORLD); + + + { //scope to free memory for tmp vector + std::vector nodes_in_elem_on_rank_size_t(num_elements_on_rank * num_nodes_per_elem); + + MPI_Scatterv(global_nodes_in_elem, conn_sendcounts.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, + nodes_in_elem_on_rank_size_t.data(), nodes_in_elem_on_rank_size_t.size(), MPI_UNSIGNED_LONG_LONG, + 0, MPI_COMM_WORLD); + + for (size_t idx = 0; idx < nodes_in_elem_on_rank_size_t.size(); ++idx) { + nodes_in_elem_on_rank[idx] = static_cast(nodes_in_elem_on_rank_size_t[idx]); + } + } + + // ****************************************************************************************** + // Send the element-element connectivity data from the initial mesh to each rank + // ****************************************************************************************** + + // First, rank 0 computes how many connectivity entries each rank will receive + // and scatters that information + int total_elem_elem_entries = 0; + + std::vector elem_elem_counts(world_size); + + if (rank == 0){ + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + tmp_num_elems_in_elem.update_host(); + MATAR_FENCE(); + // Calculate total number of connectivity entries for each rank + 
for(int i = 0; i < world_size; i++) { + elem_elem_counts[i] = 0; + for(int k = 0; k < elements_to_send[i].size(); k++) { + elem_elem_counts[i] += tmp_num_elems_in_elem.host(elements_to_send[i][k]); + } + } + } + + // Define total_elem_elem_entries to be the sum of the elem_elem_counts + // Scatter the counts to each rank + MPI_Scatter(elem_elem_counts.data(), 1, MPI_INT, + &total_elem_elem_entries, 1, MPI_INT, + 0, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout<< " Finished scatter" <(total_elem_elem_entries, "elems_in_elem_on_rank"); + + // Now scatter the num_elems_in_elem for each element on each rank + num_elems_in_elem_per_rank = CArrayDual(num_elements_on_rank, "num_elems_in_elem_per_rank"); + + if (rank == 0) { + std::vector all_num_elems_in_elem; + std::vector displs_ee(world_size); + int displacement = 0; + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + tmp_num_elems_in_elem.update_host(); + MATAR_FENCE(); + + for(int i = 0; i < world_size; i++) { + displs_ee[i] = displacement; + + std::cout<< "Rank = "<< i < all_elems_in_elem; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + + DRaggedRightArrayKokkos tmp_elems_in_elem(initial_mesh.num_elems_in_elem, "temp_elem_in_elem"); + + FOR_ALL(elem_gid, 0, initial_mesh.num_elems, { + for (size_t i = 0; i < initial_mesh.num_elems_in_elem(elem_gid); i++) { + tmp_elems_in_elem(elem_gid, i) = initial_mesh.elems_in_elem(elem_gid, i); + } // end for i + }); // end FOR_ALL elems + MATAR_FENCE(); + tmp_elems_in_elem.update_host(); + + + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + MATAR_FENCE(); + tmp_num_elems_in_elem.update_host(); + + + 
for(int i = 0; i < world_size; i++) { + sendcounts[i] = elem_elem_counts[i]; + displs[i] = displacement; + + // Copy element-element connectivity for rank i + for(int k = 0; k < elements_to_send[i].size(); k++) { + for(int l = 0; l < tmp_num_elems_in_elem.host(elements_to_send[i][k]); l++) { + all_elems_in_elem.push_back(tmp_elems_in_elem.host(elements_to_send[i][k], l)); + } + } + displacement += elem_elem_counts[i]; + } + + // Send the element-element connectivity data to each rank using MPI_Scatterv + MPI_Scatterv(all_elems_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + elems_in_elem_on_rank.host_pointer(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elems_in_elem_on_rank.host_pointer(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + + elems_in_elem_on_rank.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); + + // ****************************************************************************************** + // Initialize the naive_mesh data structures for each rank + // ****************************************************************************************** + naive_mesh.initialize_nodes(num_nodes_on_rank); + naive_mesh.initialize_elems(num_elements_on_rank, num_dim); + + naive_mesh.local_to_global_node_mapping = DCArrayKokkos(num_nodes_on_rank, "naive_mesh.local_to_global_node_mapping"); + naive_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "naive_mesh.local_to_global_elem_mapping"); + + for(int i = 0; i < num_nodes_on_rank; i++) { + naive_mesh.local_to_global_node_mapping.host(i) = nodes_on_rank[i]; + } + + for(int i = 0; i < num_elements_on_rank; i++) { + naive_mesh.local_to_global_elem_mapping.host(i) = elements_on_rank[i]; + } + + naive_mesh.local_to_global_node_mapping.update_device(); + naive_mesh.local_to_global_elem_mapping.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); + + // Timer for reverse mapping of element-node 
connectivity + double t_reverse_map_start = MPI_Wtime(); + + // rebuild the local element-node connectivity using the local node ids + for(int i = 0; i < num_elements_on_rank; i++) { + for(int j = 0; j < num_nodes_per_elem; j++) { + int node_gid = nodes_in_elem_on_rank[i * num_nodes_per_elem + j]; + + int node_lid = -1; + + // Use binary search to find the local node index for node_gid, local_to_global_node_mapping is sorted + int left = 0, right = num_nodes_on_rank - 1; + while (left <= right) { + int mid = left + (right - left) / 2; + size_t mid_gid = naive_mesh.local_to_global_node_mapping.host(mid); + if (node_gid == mid_gid) { + node_lid = mid; + break; + } else if (node_gid < mid_gid) { + right = mid - 1; + } else { + left = mid + 1; + } + } + + naive_mesh.nodes_in_elem.host(i, j) = node_lid; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + double t_reverse_map_end = MPI_Wtime(); + if(rank == 0 && print_info) { + std::cout<<" Finished reverse mapping of the element-node connectivity from the global node ids to the local node ids"< All ranks receive: [elem_count[0], elem_count[1], elem_count[2], ...] + // Rank 1: elem_count[1] / + // Rank 2: elem_count[2] / + + int num_dim = input_mesh.num_dims; + + int nodes_per_elem = input_mesh.num_nodes_in_elem; + + // MPI_Allgather: Each rank sends its element count, every rank receives + // the count from every other rank. Result: elem_counts[r] = number of + // elements owned by rank r. 
+ std::vector elem_counts(world_size); + MPI_Allgather(&input_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); // Synchronize all ranks before proceeding + + // Compute displacements: offset into the global array for each rank's data + // Example: if elem_counts = [100, 150, 120], then + // elem_displs = [0, 100, 250] (where each rank's data starts in all_elem_gids) + std::vector elem_displs(world_size); + int total_elems = 0; + for (int r = 0; r < world_size; r++) { + elem_displs[r] = total_elems; + total_elems += elem_counts[r]; + } + + // MPI_Allgatherv: Gather variable-sized data from all ranks into one array + // Each rank contributes its local_to_global_elem_mapping, which maps + // local element indices to global element GIDs. After this call, + // all_elem_gids contains ALL element GIDs from all ranks, organized by rank. + std::vector all_elem_gids(total_elems); + MPI_Allgatherv(input_mesh.local_to_global_elem_mapping.host_pointer(), input_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, + all_elem_gids.data(), elem_counts.data(), elem_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // Build a lookup map: element GID -> owning rank + // This allows O(log n) lookups to determine which rank owns any given element. 
+ std::map elem_gid_to_rank; + for (int rank_id = 0; rank_id < world_size; rank_id++) { + for (int i = 0; i < elem_counts[rank_id]; i++) { + size_t gid = all_elem_gids[elem_displs[rank_id] + i]; + elem_gid_to_rank[gid] = rank_id; + } + } + + // ======================================================================== + // STEP 2: Build index sets for local elements and nodes + // ======================================================================== + std::set local_node_gids; + std::map global_to_local_node_mapping; // GID -> local index mapping + for(int node_rid = 0; node_rid < input_mesh.num_nodes; node_rid++) { + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_rid); + local_node_gids.insert(node_gid); + global_to_local_node_mapping[node_gid] = node_rid; + } + + // Build a set of locally-owned element GIDs for quick lookup + std::set local_elem_gids; + for (int i = 0; i < input_mesh.num_elems; i++) { + local_elem_gids.insert(input_mesh.local_to_global_elem_mapping.host(i)); + } + + // ======================================================================== + // STEP 3: Exchange element-to-node connectivity via MPI_Allgatherv + // ======================================================================== + // Build a flattened connectivity array: pairs of (elem_gid, node_gid) + // Example for 2 elements with 8 nodes each: + // elem_node_conn = [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] + // + // This format is chosen because it's easy to serialize and deserialize over MPI, + // and allows us to reconstruct the full element-node relationships. 
+ std::vector elem_node_conn; + int local_conn_size = 0; + + // For each locally-owned element, record its GID and all its node GIDs + for (int lid = 0; lid < input_mesh.num_elems; lid++) { + size_t elem_gid = input_mesh.local_to_global_elem_mapping.host(lid); + + // Access nodes_in_elem[lid][*] to get all nodes in this element + for (int j = 0; j < input_mesh.num_nodes_in_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(lid, j); // Local index + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); // Global index + + elem_node_conn.push_back(elem_gid); + elem_node_conn.push_back(node_gid); + } + local_conn_size += nodes_per_elem * 2; // Each element contributes (num_nodes_in_elem * 2) size_ts + } + + + + // ======================================================================== + // Perform MPI communication to gather connectivity from all ranks + // ======================================================================== + // Similar to Step 1, we use MPI_Allgatherv to collect all element-node + // connectivity pairs. This is a two-stage process: + // 1) Gather the size of each rank's connectivity data + // 2) Gather the actual connectivity data with proper offsets + + // Stage 1: Gather connectivity sizes from each rank + // conn_sizes[r] = number of size_t values that rank r will send + std::vector conn_sizes(world_size); + MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements for the second MPI_Allgatherv call + // Displcements tell each rank where its data should be placed in the global array + std::vector conn_displs(world_size); + int total_conn = 0; + for (int r = 0; r < world_size; r++) { + conn_displs[r] = total_conn; + total_conn += conn_sizes[r]; + } + + // Stage 2: Gather all element-node connectivity data + // After this call, all_conn contains the flattened connectivity from every rank, + // organized by rank. 
Access data from rank r using indices [conn_displs[r], conn_displs[r] + conn_sizes[r]) + std::vector all_conn(total_conn); + MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, + all_conn.data(), conn_sizes.data(), conn_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // ======================================================================== + // STEP 4: Identify ghost elements + // ======================================================================== + // A ghost element is an element owned by another rank that shares at least + // one node with our locally-owned elements. This step identifies all such elements. + + + // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) + std::set ghost_elem_gids; + std::set ghost_node_gids; + + std::map ghost_node_recv_rank; + + // Iterate through connectivity data from each rank (except ourselves) + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this node belongs to one of our locally-owned elements + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + + // Check if this element is NOT owned by us (i.e., it's from another rank) + if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { + // This is a ghost element for us + ghost_elem_gids.insert(elem_gid); + } + } + } + } + MPI_Barrier(MPI_COMM_WORLD); + + std::map> ghost_nodes_from_ranks; + + // Iterate through connectivity data from each rank (except ourselves) + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this element belongs to one of our ghost elements + if (ghost_elem_gids.find(elem_gid) != ghost_elem_gids.end()) { + + // Check if this node is NOT owned by us (i.e., it's from another rank) + if (local_node_gids.find(node_gid) == local_node_gids.end()) { + // This is a ghost node for us + ghost_node_gids.insert(node_gid); + ghost_node_recv_rank[node_gid] = r; + ghost_nodes_from_ranks[r].insert(node_gid); + } + } + } + } + + std::set shared_nodes; // nodes on MPI rank boundaries + // Iterate through connectivity data from each rank (except ourselves) to find shared nodes + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this element belongs to one of our ghost elements + if (ghost_elem_gids.find(elem_gid) != ghost_elem_gids.end()) { + // If another rank references a node that is also owned by us, it is a shared node + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + shared_nodes.insert(node_gid); + + } + } + } + } + + // Create a vecor of the ranks that this rank will receive data from for ghost nodes + std::set ghost_node_receive_ranks; + for (const auto& pair : ghost_node_recv_rank) { + ghost_node_receive_ranks.insert(pair.second); + } + + std::vector ghost_node_receive_ranks_vec(ghost_node_receive_ranks.begin(), ghost_node_receive_ranks.end()); + + + // Find which nodes *we own* are ghosted on other ranks, and on which ranks + // We want: for each of our local nodes, the list of ranks that ghost it + + // Map: local_node_gid -> set of remote ranks that ghost this node + std::map> local_node_gid_to_ghosting_ranks; + + std::vector> shared_nodes_on_ranks(world_size); + + // Iterate through connectivity from all ranks except ourselves + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // skip our own rank + + int num_pairs = conn_sizes[r] / 2; + for (int i = 0; i < num_pairs; i++) { + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this node is owned by us, and remote rank references it, they are ghosting it + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + local_node_gid_to_ghosting_ranks[node_gid].insert(r); + shared_nodes_on_ranks[r].insert(node_gid); + } + } + } + + // Use the map to create a vector of the ranks that 
this rank will receive data from for ghost nodes + std::set ghost_node_send_ranks; + for (const auto& pair : local_node_gid_to_ghosting_ranks) { + ghost_node_send_ranks.insert(pair.second.begin(), pair.second.end()); + } + std::vector ghost_node_send_ranks_vec(ghost_node_send_ranks.begin(), ghost_node_send_ranks.end()); + + // Store the count of ghost elements for later use + input_mesh.num_ghost_elems = ghost_elem_gids.size(); + input_mesh.num_ghost_nodes = ghost_node_gids.size(); + MPI_Barrier(MPI_COMM_WORLD); + + + // ======================================================================== + // STEP 5: Extract ghost element connectivity + // ======================================================================== + // Now that we know which elements are ghosts, we need to extract their + // full node connectivity from all_conn. This allows us to properly construct + // the extended mesh with ghost elements included. + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; + + // Build a map: ghost_elem_gid -> vector of node_gids + // We pre-allocate the vector size to avoid repeated reallocations + std::map> ghost_elem_to_nodes; + for (const size_t& ghost_gid : ghost_elem_gids) { + ghost_elem_to_nodes[ghost_gid].reserve(input_mesh.num_nodes_in_elem); + } + + // ======================================================================== + // Extract nodes for each ghost element from the globally-collected all_conn + // ======================================================================== + // The all_conn array was populated by MPI_Allgatherv and contains connectivity + // pairs (elem_gid, node_gid) for all elements from all ranks. We now parse + // this data to extract the nodes for each ghost element. 
+ for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already have owned element connectivity + + // Parse connectivity data for rank r + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Calculate offset for this pair: displacement + (pair_index * 2) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this element is one of our identified ghost elements, record its node + auto it = ghost_elem_to_nodes.find(elem_gid); + if (it != ghost_elem_to_nodes.end()) { + it->second.push_back(node_gid); + } + } + } + + // ======================================================================== + // Validation: Verify each ghost element has the correct number of nodes + // ======================================================================== + // This catch detects issues in the MPI communication or parsing logic + for (auto& pair : ghost_elem_to_nodes) { + if (pair.second.size() != static_cast(input_mesh.num_nodes_in_elem)) { + std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first + << " has " << pair.second.size() << " nodes, expected " << input_mesh.num_nodes_in_elem << std::endl; + } + } + + // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) + // Start with owned nodes + std::map node_gid_to_extended_lid; + int extended_node_lid = 0; + + // Add all owned nodes + for (int i = 0; i < input_mesh.num_nodes; i++) { + size_t node_gid = input_mesh.local_to_global_node_mapping.host(i); + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) + std::set ghost_only_nodes; + for (const auto& pair : ghost_elem_to_nodes) { + for (size_t node_gid : pair.second) { + // Check if we already have this node + if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { + 
ghost_only_nodes.insert(node_gid); + } + } + } + + // Assign extended local IDs to ghost-only nodes + for (size_t node_gid : ghost_only_nodes) { + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + int total_extended_nodes = extended_node_lid; + + MPI_Barrier(MPI_COMM_WORLD); + + // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) + // Build request list: for each ghost node, find an owning rank via any ghost element that contains it + std::map> rank_to_ghost_node_requests; + for (size_t node_gid : ghost_only_nodes) { + // Find which rank owns an element containing this node + // Look through ghost elements + for (const auto& pair : ghost_elem_to_nodes) { + size_t ghost_elem_gid = pair.first; + const std::vector& nodes = pair.second; + bool found = false; + for (size_t ngid : nodes) { + if (ngid == node_gid) { + found = true; + break; + } + } + if (found) { + auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); + if (owner_it != elem_gid_to_rank.end()) { + rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); + break; + } + } + } + } + + // Step 4: Build extended element list and node connectivity + // Owned elements: 0 to num_new_elems-1 (already have these) + // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 + + // Create extended element-node connectivity array + int total_extended_elems = input_mesh.num_elems + input_mesh.num_ghost_elems; + std::vector> extended_nodes_in_elem(total_extended_elems); + + // Copy owned element connectivity (convert to extended node LIDs) + for (int lid = 0; lid < input_mesh.num_elems; lid++) { + extended_nodes_in_elem[lid].reserve(nodes_per_elem); + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[lid].push_back(ext_lid); + } + } + + // 
Add ghost element connectivity (map ghost node GIDs to extended node LIDs) + int ghost_elem_ext_lid = input_mesh.num_elems; + std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); + std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); + + for (size_t ghost_gid : ghost_elem_gids_ordered) { + auto it = ghost_elem_to_nodes.find(ghost_gid); + if (it == ghost_elem_to_nodes.end()) continue; + + extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); + for (size_t node_gid : it->second) { + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); + } + ghost_elem_ext_lid++; + } + + MPI_Barrier(MPI_COMM_WORLD); + // Sequential rank-wise printing of extended mesh structure info + if(print_info) { + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << input_mesh.num_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << input_mesh.num_nodes << std::endl; + std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; + std::cout << std::flush; + } + MPI_Barrier(MPI_COMM_WORLD); + } + } + // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements + // Each element's nodes are stored using extended local node IDs (0-based, contiguous) + + // Build reverse maps: extended_lid -> gid for nodes and elements + std::vector extended_lid_to_node_gid(total_extended_nodes); + 
for (const auto& pair : node_gid_to_extended_lid) { + extended_lid_to_node_gid[pair.second] = pair.first; + } + + // Build extended element GID list: owned first, then ghost + std::vector extended_lid_to_elem_gid(total_extended_elems); + + // Owned elements + for (int i = 0; i < input_mesh.num_elems; i++) { + extended_lid_to_elem_gid[i] = input_mesh.local_to_global_elem_mapping.host(i); + } + + // Ghost elements (in sorted order) + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + extended_lid_to_elem_gid[input_mesh.num_elems + i] = ghost_elem_gids_ordered[i]; + } + + // Build array: for each ghost element, store which rank owns it (where to receive data from) + std::vector ghost_elem_owner_ranks(ghost_elem_gids_ordered.size()); + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + size_t ghost_gid = ghost_elem_gids_ordered[i]; + auto it = elem_gid_to_rank.find(ghost_gid); + if (it != elem_gid_to_rank.end()) { + ghost_elem_owner_ranks[i] = it->second; + } else { + std::cerr << "[rank " << rank << "] ERROR: Ghost element GID " << ghost_gid + << " not found in elem_gid_to_rank map!" 
<< std::endl; + ghost_elem_owner_ranks[i] = -1; // Invalid rank as error indicator + } + } + + // Create a std::set of all the ranks this rank will receive data from + std::set ghost_elem_receive_ranks; + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); + } + + // ****************************************************************************************** + // Build the final partitioned mesh + // ****************************************************************************************** + + + output_mesh.initialize_nodes(total_extended_nodes); + output_mesh.initialize_elems(total_extended_elems, 3); + output_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); + output_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); + for (int i = 0; i < total_extended_nodes; i++) { + output_mesh.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; + } + for (int i = 0; i < total_extended_elems; i++) { + output_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; + } + output_mesh.local_to_global_node_mapping.update_device(); + output_mesh.local_to_global_elem_mapping.update_device(); + + output_mesh.num_ghost_elems = ghost_elem_gids.size(); + output_mesh.num_ghost_nodes = ghost_only_nodes.size(); + + output_mesh.num_owned_elems = input_mesh.num_elems; + output_mesh.num_owned_nodes = input_mesh.num_nodes; + + MPI_Barrier(MPI_COMM_WORLD); + // rebuild the local element-node connectivity using the local node ids + // extended_nodes_in_elem already contains extended local node IDs, so we can use them directly + for(int i = 0; i < total_extended_elems; i++) { + for(int j = 0; j < nodes_per_elem; j++) { + output_mesh.nodes_in_elem.host(i, j) = extended_nodes_in_elem[i][j]; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + output_mesh.nodes_in_elem.update_device(); + output_mesh.build_connectivity(); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank 
== 0) std::cout << " Finished building final mesh structure" << std::endl; + + + // ****************************************************************************************** + // Build the final nodes that include ghost + // ****************************************************************************************** + + + output_node.initialize(total_extended_nodes, num_dim, {node_state::coords}, node_communication_plan); + MPI_Barrier(MPI_COMM_WORLD); + + // The goal here is to populate output_node.coords using globally gathered ghost node coordinates, + // since input_node does not contain ghost node coordinates. + // + // Each rank will: + // 1. Gather coordinates of its owned nodes (from input_node). + // 2. Use MPI to gather all coordinates for all required (owned + ghost) global node IDs + // into a structure mapping global ID -> coordinate. + // 3. Use this map to fill output_node.coords. + + // 1. Build list of all global node IDs needed on this rank (owned + ghosts) + std::vector all_needed_node_gids(total_extended_nodes); + for (int i = 0; i < total_extended_nodes; i++) { + all_needed_node_gids[i] = output_mesh.local_to_global_node_mapping.host(i); + } + + // 2. Build owned node GIDs and their coordinates + std::vector owned_gids(output_mesh.num_owned_nodes); + for (int i = 0; i < output_mesh.num_owned_nodes; i++) + owned_gids[i] = output_mesh.local_to_global_node_mapping.host(i); + + // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) + // so we can distribute the needed coordinate data. 
+ // The easiest is to Allgather everyone's "owned_gids" and coords + + int local_owned_count = static_cast(owned_gids.size()); + std::vector owned_counts(world_size, 0); + if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 + + // a) Gather counts + owned_counts.resize(world_size, 0); + MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // b) Displacements and total + std::vector owned_displs(world_size,0); + int total_owned = 0; + for (int r = 0; r < world_size; r++) { + owned_displs[r] = total_owned; + total_owned += owned_counts[r]; + } + + // c) Global GIDs (size: total_owned) + std::vector all_owned_gids(total_owned); + MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, + all_owned_gids.data(), owned_counts.data(), owned_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // Map node gid -> owning rank + std::unordered_map node_gid_to_owner_rank; + int owner_offset = 0; + for (int r = 0; r < world_size; r++) { + for (int i = 0; i < owned_counts[r]; i++) { + node_gid_to_owner_rank[all_owned_gids[owner_offset + i]] = r; + } + owner_offset += owned_counts[r]; + } + + + // d) Global coords (size: total_owned x 3) + std::vector owned_coords_send(num_dim*local_owned_count, 0.0); + for (int i = 0; i < local_owned_count; i++) { + for(int dim = 0; dim < num_dim; dim++){ + owned_coords_send[num_dim*i+dim] = input_node.coords.host(i,dim); + } + } + std::vector all_owned_coords(num_dim * total_owned, 0.0); + + // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Getting coord_counts" << std::endl; + + std::vector coord_counts(world_size); + std::vector coord_displs(world_size); + for (int r = 0; r < world_size; r++) { + coord_counts[r] = num_dim * owned_counts[r]; // Each node has num_dim doubles + coord_displs[r] = num_dim * owned_displs[r]; // Displacement in 
doubles + } + + MPI_Allgatherv(owned_coords_send.data(), num_dim*local_owned_count, MPI_DOUBLE, + all_owned_coords.data(), coord_counts.data(), coord_displs.data(), + MPI_DOUBLE, MPI_COMM_WORLD); + + // e) Build map: gid -> coord[3] + std::unordered_map> gid_to_coord; + for (int i = 0; i < total_owned; i++) { + std::vector xyz(num_dim); // size is runtime-dependent + for (int dim = 0; dim < num_dim; dim++) { + xyz[dim] = all_owned_coords[num_dim * i + dim]; + } + gid_to_coord[all_owned_gids[i]] = std::move(xyz); + } + + // 4. Finally, fill output_node.coords with correct coordinates. + for (int i = 0; i < total_extended_nodes; i++) { + size_t gid = output_mesh.local_to_global_node_mapping.host(i); + auto it = gid_to_coord.find(gid); + if (it != gid_to_coord.end()) { + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = it->second[dim]; + } + } else { + // Could happen if there's a bug: fill with zeros for safety + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = 0.0; + } + } + } + output_node.coords.update_device(); + + + // -------------------------------------------------------------------------------------- + // Build the send patterns for elements + // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost element GIDs. + // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + std::vector>> boundary_elem_targets(output_mesh.num_owned_elems); + + // Prepare local ghost list as vector + std::vector ghost_gids_vec; + ghost_gids_vec.reserve(output_mesh.num_ghost_elems); + for (int i = 0; i < output_mesh.num_ghost_elems; i++) { + ghost_gids_vec.push_back(output_mesh.local_to_global_elem_mapping.host(output_mesh.num_owned_elems + i)); // Ghost elements are after the owned elements in the global element mapping + } + + // Exchange counts + std::vector ghost_counts(world_size, 0); + int local_ghost_count = output_mesh.num_ghost_elems; + MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Displacements and recv buffer + std::vector ghost_displs(world_size, 0); + int total_ghosts = 0; + for (int r = 0; r < world_size; r++) { + ghost_displs[r] = total_ghosts; + total_ghosts += ghost_counts[r]; + } + std::vector all_ghost_gids(total_ghosts); + + // Gather ghost gids + MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, + all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + + // Build map gid -> ranks that ghost it + std::unordered_map> gid_to_ghosting_ranks; + gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); + for (int r = 0; r < world_size; r++) { + int cnt = ghost_counts[r]; + int off = ghost_displs[r]; + for (int i = 0; i < cnt; i++) { + size_t g = all_ghost_gids[off + i]; + gid_to_ghosting_ranks[g].push_back(r); + } + } + + // For each local element, list destinations: ranks that ghost our gid + for (int elem_lid = 0; elem_lid < output_mesh.num_owned_elems; elem_lid++) { + size_t local_elem_gid = output_mesh.local_to_global_elem_mapping.host(elem_lid); + auto it = gid_to_ghosting_ranks.find(local_elem_gid); + if (it == gid_to_ghosting_ranks.end()) continue; + const std::vector &dest_ranks = it->second; + for 
(int rr : dest_ranks) { + if (rr == rank) continue; + boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<"After boundary_elem_targets"< boundary_elem_local_ids; + std::vector> boundary_to_ghost_ranks; // ragged array dimensions (num_boundary_elems, num_ghost_ranks) + + std::set ghost_comm_ranks; // set of ranks that this rank communicates with + + + for (int elem_lid = 0; elem_lid < output_mesh.num_owned_elems; elem_lid++) { + + int local_elem_gid = output_mesh.local_to_global_elem_mapping.host(elem_lid); + if (boundary_elem_targets[elem_lid].empty()) + { + continue; + } + else + { + // Fill in vector of boundary local_ids + boundary_elem_local_ids.push_back(elem_lid); + std::vector ghost_ranks_for_this_boundary_elem; + for (const auto &pr : boundary_elem_targets[elem_lid]) { + ghost_ranks_for_this_boundary_elem.push_back(pr.first); + ghost_comm_ranks.insert(pr.first); + } + boundary_to_ghost_ranks.push_back(ghost_ranks_for_this_boundary_elem); + } + } + + int num_ghost_comm_ranks = ghost_comm_ranks.size(); + std::vector ghost_comm_ranks_vec(num_ghost_comm_ranks); + int i = 0; + for (const auto &r : ghost_comm_ranks) { + ghost_comm_ranks_vec[i] = r; + i++; + } + + + MPI_Barrier(MPI_COMM_WORLD); + + output_mesh.num_boundary_elems = boundary_elem_local_ids.size(); + output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems, "boundary_elem_local_ids"); + for (int i = 0; i < output_mesh.num_boundary_elems; i++) { + output_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; + } + output_mesh.boundary_elem_local_ids.update_device(); + + print_info = false; + + + MPI_Barrier(MPI_COMM_WORLD); + + std::map> node_set_to_send_by_rank; + + // For each owned element that will be ghosted on other ranks, + // collect the nodes that need to be sent to those ranks + // boundary_elem_targets[elem_lid] contains pairs (rank, elem_gid) for ranks that 
ghost this element + for (int elem_lid = 0; elem_lid < input_mesh.num_elems; elem_lid++) { + // Get ranks that will ghost this element + for (const auto& pair : boundary_elem_targets[elem_lid]) { + int ghosting_rank = pair.first; + + // For each node in this element + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(elem_lid, j); + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); + + // Only send nodes that are NOT shared (not on MPI rank boundary) + // Shared nodes are already known to both ranks + if (shared_nodes_on_ranks[ghosting_rank].find(node_gid) == shared_nodes_on_ranks[ghosting_rank].end()) { // WARNING: THIS SHOULD BE MOFIFIED TO ONLY FILTER SHARED NODES WITH THIS SPECIFIC RANK + node_set_to_send_by_rank[ghosting_rank].insert(node_gid); + } + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + std::map> nodes_to_send_by_rank; // rank -> list of global node indices + + // Copy the node_set_to_send_by_rank map to nodes_to_send_by_rank + for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { + for (size_t node_gid : node_gids) { + nodes_to_send_by_rank[dest_rank].push_back(node_gid); + } + } + + // Initialize graph comms for elements + // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator + // that efficiently represents the communication pattern between ranks. + // This allows MPI to optimize communication based on the actual connectivity pattern. 
+ + + // ---------- Prepare INCOMING edges (sources) ---------- + // indegree: Number of ranks from which this rank will RECEIVE data + // These are the ranks that own elements which are ghosted on this rank + std::vector ghost_elem_receive_ranks_vec(ghost_elem_receive_ranks.begin(), + ghost_elem_receive_ranks.end()); + // The number of ranks from which this rank will receive data (incoming neighbors) + int elem_indegree = static_cast(ghost_elem_receive_ranks_vec.size()); + + // sources: Array of source rank IDs (ranks we receive from) + // Each element corresponds to a rank that owns elements we ghost + int* sources = (elem_indegree > 0) ? ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; + + + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + // int* sourceweights = MPI_UNWEIGHTED; + + // ---------- Prepare OUTGOING edges (destinations) ---------- + // outdegree: Number of ranks to which this rank will SEND data + // These are the ranks that ghost elements owned by this rank + int outdegree = num_ghost_comm_ranks; + + // destinations: Array of destination rank IDs (ranks we send to) + // Each element corresponds to a rank that ghosts our owned elements + int* destinations = (outdegree > 0) ? 
ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; + + // Initialize the graph communicator for element communication + element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), elem_indegree, ghost_elem_receive_ranks_vec.data()); + MPI_Barrier(MPI_COMM_WORLD); + + // Optional: Verify the graph communicator was created successfully + // if(print_info) element_communication_plan.verify_graph_communicator(); + + + // Initialize graph comms for nodes + // ---------- Prepare INCOMING edges (sources) ---------- + // indegree: Number of ranks from which this rank will RECEIVE data + // These are the ranks that own nodes which are ghosted on this rank + int node_indegree = static_cast(ghost_node_receive_ranks.size()); + int* node_sources = (node_indegree > 0) ? ghost_node_receive_ranks_vec.data() : MPI_UNWEIGHTED; + + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + //int* node_sourceweights = MPI_UNWEIGHTED; + + // ---------- Prepare OUTGOING edges (destinations) ---------- + // outdegree: Number of ranks to which this rank will SEND data + // These are the ranks that ghost nodes owned by this rank + int node_outdegree = static_cast(ghost_node_send_ranks.size()); + int* node_destinations = (node_outdegree > 0) ? 
ghost_node_send_ranks_vec.data() : MPI_UNWEIGHTED; + + // destinationweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) + // int* node_destinationweights = MPI_UNWEIGHTED; + + // Initialize the graph communicator for node communication + node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout<<"After node graph communicator"< elem_sendcounts(element_communication_plan.num_send_ranks, 0); + // std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); + + // Count how many boundary elements go to each destination rank + // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element + std::map> elems_to_send_by_rank; // rank -> list of boundary element local IDs + + for (int elem_lid = 0; elem_lid < input_mesh.num_elems; elem_lid++) { + if (!boundary_elem_targets[elem_lid].empty()) { + for (const auto &pr : boundary_elem_targets[elem_lid]) { + int dest_rank = pr.first; + elems_to_send_by_rank[dest_rank].push_back(elem_lid); + } + } + } + + // Serialize into a DRaggedRightArrayKokkos + DCArrayKokkos strides_array(element_communication_plan.num_send_ranks, "strides_for_elems_to_send"); + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + strides_array.host(i) = elems_to_send_by_rank[dest_rank].size(); + } + strides_array.update_device(); + DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); + + // Fill in the data + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + for (int j = 0; j < elems_to_send_by_rank[dest_rank].size(); j++) { + elems_to_send_by_rank_rr.host(i, j) = elems_to_send_by_rank[dest_rank][j]; + } + } + elems_to_send_by_rank_rr.update_device(); + + + // 
Count how many ghost elements come from each source rank + // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element + std::map> elems_to_recv_by_rank; // rank -> list of ghost element indices + + for (size_t i = 0; i < ghost_elem_owner_ranks.size(); i++) { + int source_rank = ghost_elem_owner_ranks[i]; + int ghost_elem_local_id = output_mesh.num_owned_elems + i; + elems_to_recv_by_rank[source_rank].push_back(ghost_elem_local_id); + } + + // ========== Serialize into a DRaggedRightArrayKokkos ========== + DCArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks, "elem_recv_strides_array"); + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); + elem_recv_strides_array.host(i) = elems_to_recv_by_rank[source_rank].size(); + + } + elem_recv_strides_array.update_device(); + DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); + // Fill in the data + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); + for (int j = 0; j < elems_to_recv_by_rank[source_rank].size(); j++) { + elems_to_recv_by_rank_rr.host(i, j) = elems_to_recv_by_rank[source_rank][j]; + } + } + elems_to_recv_by_rank_rr.update_device(); + MATAR_FENCE(); + element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); + + MPI_Barrier(MPI_COMM_WORLD); + + // -------------------------------------------------------------------------------------- + // Build the send pattern for nodes + // -------------------------------------------------------------------------------------- + // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost node GIDs. + // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. 
+ // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. + // -------------------------------------------------------------------------------------- + + // Serialize into a DRaggedRightArrayKokkos + DCArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks,"node_send_strides_array"); + for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + int dest_rank = node_communication_plan.send_rank_ids.host(i); + node_send_strides_array.host(i) = nodes_to_send_by_rank[dest_rank].size(); + } + node_send_strides_array.update_device(); + DRaggedRightArrayKokkos nodes_to_send_by_rank_rr(node_send_strides_array, "nodes_to_send_by_rank"); + + // Fill in the data + for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + int dest_rank = node_communication_plan.send_rank_ids.host(i); + for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { + int node_gid = nodes_to_send_by_rank[dest_rank][j]; + int node_lid = node_gid_to_extended_lid[node_gid]; + nodes_to_send_by_rank_rr.host(i, j) = node_lid; + } + } + nodes_to_send_by_rank_rr.update_device(); + + // For each ghost element, determine which nodes need to be received from the owning rank + // Build the receive list based on ghost element nodes, not on ghost_node_gids + // This ensures we receive all nodes needed by ghost elements + std::map> node_set_to_recv_by_rank; // rank -> set of node GIDs to receive + + for (int i = 0; i < output_mesh.num_ghost_elems; i++) { + int ghost_elem_lid = output_mesh.num_owned_elems + i; + size_t ghost_elem_gid = output_mesh.local_to_global_elem_mapping.host(ghost_elem_lid); + int owning_rank = elem_gid_to_rank.at(ghost_elem_gid); + + // Collect all nodes in this ghost element + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = output_mesh.nodes_in_elem.host(ghost_elem_lid, j); + size_t node_gid = output_mesh.local_to_global_node_mapping.host(node_lid); + + // Only receive nodes that: + // 1. 
We don't own (not in local_node_gids) + // 2. Are NOT shared (not on MPI rank boundary) + // Shared nodes are already known to both ranks via element connectivity + if (local_node_gids.find(node_gid) == local_node_gids.end() && + shared_nodes_on_ranks[owning_rank].find(node_gid) == shared_nodes_on_ranks[owning_rank].end()) { + node_set_to_recv_by_rank[owning_rank].insert(node_gid); + } + } + } + + // Convert node GIDs to local indices and build nodes_to_recv_by_rank + std::map> nodes_to_recv_by_rank; // rank -> list of ghost node local indices + std::map node_gid_to_ghost_lid; // map ghost node GID to its local index in output_mesh + + // Build the GID->local index mapping for ALL ghost nodes in output_mesh + // Ghost nodes are those with local IDs >= num_owned_nodes + for (int i = output_mesh.num_owned_nodes; i < output_mesh.num_nodes; i++) { + size_t node_gid = output_mesh.local_to_global_node_mapping.host(i); + node_gid_to_ghost_lid[node_gid] = i; + } + + // Now convert the GID sets to local index vectors + for (const auto& pair : node_set_to_recv_by_rank) { + int source_rank = pair.first; + const std::set& node_gids = pair.second; + + for (size_t node_gid : node_gids) { + auto it = node_gid_to_ghost_lid.find(node_gid); + if (it != node_gid_to_ghost_lid.end()) { + nodes_to_recv_by_rank[source_rank].push_back(it->second); + } + } + } + + // Serialize into a DRaggedRightArrayKokkos + DCArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks, "nodes_recv_strides_array"); + for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + int source_rank = node_communication_plan.recv_rank_ids.host(i); + nodes_recv_strides_array.host(i) = nodes_to_recv_by_rank[source_rank].size(); + } + nodes_recv_strides_array.update_device(); + DRaggedRightArrayKokkos nodes_to_recv_by_rank_rr(nodes_recv_strides_array, "nodes_to_recv_by_rank"); + // Fill in the data + for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + int source_rank = 
node_communication_plan.recv_rank_ids.host(i);
+        for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) {
+            size_t node_gid = nodes_to_recv_by_rank[source_rank][j];
+            size_t local_id = node_gid_to_extended_lid[node_gid];
+
+            nodes_to_recv_by_rank_rr.host(i, j) = nodes_to_recv_by_rank[source_rank][j];
+        }
+    }
+    nodes_to_recv_by_rank_rr.update_device();
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    node_communication_plan.setup_send_recv(nodes_to_send_by_rank_rr, nodes_to_recv_by_rank_rr);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    // node_communication_plan.verify_send_recv();
+
+}
+
+
+/**
+ * @brief Partitions the input mesh using PT-Scotch and constructs the final distributed mesh.
+ *
+ * This function performs parallel mesh partitioning using a two-stage approach:
+ * 1. A naive partition is first constructed (simple assignment of mesh elements/nodes across ranks).
+ * 2. PT-Scotch is then used to repartition the mesh for load balancing and improved connectivity.
+ *
+ * The partitioned mesh, nodal data, and associated connectivity/gauss point information
+ * are distributed among MPI ranks as a result. The procedure ensures that each rank receives
+ * its assigned portion of the mesh and associated data in the final (target) decomposition.
+ *
+ * @param[in]  initial_mesh The input (global) mesh, present on rank 0 or all ranks at start.
+ * @param[out] final_mesh   The mesh assigned to this rank after PT-Scotch decomposition.
+ * @param[in]  initial_node Nodal data for the input (global) mesh; must match initial_mesh.
+ * @param[out] final_node   Nodal data for this rank after decomposition (corresponds to final_mesh).
+ * @param[out] gauss_point  Gauss point data structure, filled out for this rank's mesh.
+ * @param[in]  world_size   Number of MPI ranks in use (the total number of partitions).
+ * @param[in]  rank         This process's MPI rank ID.
+ *
+ * Internals:
+ * - The routine uses a naive_partition_mesh() helper to create an initial contiguous mesh partition.
+ * - It then uses PT-Scotch distributed graph routines to compute an improved partition and create the final mesh layout. + * - Both element-to-element and node-to-element connectivity, as well as mapping and ghosting information, + * are managed and exchanged across ranks. + * - MPI routines synchronize and exchange the relevant mesh and nodal data following the computed partition. + */ + +void partition_mesh( + Mesh_t& initial_mesh, + Mesh_t& final_mesh, + node_t& initial_node, + node_t& final_node, + GaussPoint_t& gauss_point, + int world_size, + int rank){ + + bool print_info = false; + // bool print_vtk = false; + + int num_dim = initial_mesh.num_dims; + + // Create mesh, gauss points, and node data structures on each rank + // This is the initial partitioned mesh + Mesh_t naive_mesh; + node_t naive_node; + + // Mesh partitioned by pt-scotch, not including ghost + Mesh_t intermediate_mesh; + node_t intermediate_node; + + // Helper arrays to hold element-element connectivity for naive partitioning that include what would be ghost, without having to build the full mesh + CArrayDual elems_in_elem_on_rank; + CArrayDual num_elems_in_elem_per_rank; + + + // Perform the naive partitioning of the mesh + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "Performing the naive partitioning of the mesh" << std::endl; + naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "Begin repartitioning using PT-Scotch" << std::endl; + + /********************************************************************************** + * Build PT-Scotch distributed graph representation of the mesh for repartitioning * + ********************************************************************************** + * + * This section constructs the distributed graph (SCOTCH_Dgraph) needed by PT-Scotch + * for mesh repartitioning. 
In this graph, each mesh element is a vertex, and edges
+ * correspond to mesh-neighbor relationships (i.e., elements that share a face or are
+ * otherwise neighbors per your mesh definition).
+ *
+ * We use the compact CSR (Compressed Sparse Row) representation, passing only the
+ * essential information required by PT-Scotch.
+ *
+ * Variables and structures used:
+ * - SCOTCH_Dgraph dgraph:
+ *     The distributed graph instance managed by PT-Scotch. Each MPI rank creates
+ *     and fills in its portion of the global graph.
+ *
+ * - const SCOTCH_Num baseval:
+ *     The base value for vertex and edge numbering. Set to 0 for C-style zero-based
+ *     arrays. Always use 0 unless you are using Fortran style 1-based arrays.
+ *
+ * - const SCOTCH_Num vertlocnbr:
+ *     The *number of local vertices* (mesh elements) defined on this MPI rank.
+ *     In our mesh, this is mesh.num_elems. PT-Scotch expects each rank to specify
+ *     its own local vertex count.
+ *
+ * - const SCOTCH_Num vertlocmax:
+ *     The *maximum number of local vertices* that could be stored (capacity). We
+ *     allocate with no unused holes, so vertlocmax = vertlocnbr.
+ *
+ * - std::vector<SCOTCH_Num> vertloctab:
+ *     CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i]
+ *     gives the index in edgeloctab where the neighbor list of vertex i begins.
+ *     PT-Scotch expects this array to be of size vertlocnbr+1, where the difference
+ *     vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i.
+ *
+ * - std::vector<SCOTCH_Num> edgeloctab:
+ *     CSR array [variable size]: a flattened list of *neighboring element global IDs*,
+ *     in no particular order. For vertex i, its neighbors are located at
+ *     edgeloctab[vertloctab[i]...vertloctab[i+1]-1].
+ *     In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to
+ *     recognize edges both within and across ranks.
+ * + * - std::map elem_gid_to_offset: + * Helper map: For a given element global ID, gives the starting offset in + * the flattened neighbor array (elems_in_elem_on_rank) where this element's + * list of neighbor GIDs begins. This allows efficient neighbor list lookup. + * + * - (other arrays used, from mesh setup and communication phase) + * - elements_on_rank: vector of global element IDs owned by this rank. + * - num_elements_on_rank: number of owned elements. + * - num_elems_in_elem_per_rank: array, for each owned element, how many + * neighbors it has. + * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. + * + **********************************************************************************/ + + // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- + SCOTCH_Dgraph dgraph; + if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + // Set base value for numbering (0 for C-style arrays) + const SCOTCH_Num baseval = 0; + + // vertlocnbr: Number of elements (vertices) that are local to this MPI rank + const SCOTCH_Num vertlocnbr = static_cast(naive_mesh.num_elems); + + // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) + const SCOTCH_Num vertlocmax = vertlocnbr; + + // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- + // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins + std::vector vertloctab(vertlocnbr + 1); + + // edgeloctab: flat array of neighbor global IDs for all local elements, built in order + std::vector edgeloctab; + // edgeloctab holds the flattened list of all neighbors (edges) for all local elements, + // in a compact CSR (Compressed Sparse Row) format expected by PT-Scotch. Each entry is a global element ID + // of a neighbor. 
The edgeloctab array is built incrementally with one entry per element neighbor edge, + // so we reserve its capacity up front for efficiency. + // + // Heuristic: For unstructured 3D hexahedral meshes, a single element can have significantly more neighbors + // than in 2D cases. In a fully structured 3D grid, each hexahedral element can have up to 26 neighbors + // (since it may touch all surrounding elements along all axes). In unstructured grids, it's possible for some + // elements to have even more neighbors due to mesh irregularities and refinements. + // + // For most practical unstructured hexahedral meshes, values in the low 20s are common, but extreme cases + // (e.g., high-order connectivity, pathological splits, or meshes with "hanging nodes") may see higher counts. + // Using vertlocnbr * 26 as an upper limit is a reasonable estimate for fully connected (structured) cases, + // but consider increasing this if working with highly unstructured or pathological meshes. For safety and + // to avoid repeated reallocations during construction, we use 26 here as a conservative guess. + edgeloctab.reserve(vertlocnbr * 26); + + // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) + // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. 
+ std::map elem_gid_to_offset; + size_t current_offset = 0; + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + elem_gid_to_offset[elem_gid_on_rank] = current_offset; + current_offset += num_elems_in_elem_per_rank.host(k); + } + + // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- + SCOTCH_Num offset = 0; // running count of edges encountered + + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { + + // Record current edge offset for vertex lid in vertloctab + vertloctab[lid] = offset; + + // Obtain this local element's global ID (from mapping) + int elem_gid = naive_mesh.local_to_global_elem_mapping.host(lid); + + // Find offset in the flattened neighbor array for this element's neighbor list + size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; + + // For this element, find the count of its neighbors + // This requires finding its index in the elements_on_rank array + size_t idx = 0; + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + if (elem_gid_on_rank == elem_gid) { + idx = k; + break; + } + } + size_t num_nbrs = num_elems_in_elem_per_rank.host(idx); + + // Append each neighbor (by its GLOBAL elem GID) to edgeloctab + for (size_t j = 0; j < num_nbrs; j++) { + size_t neighbor_gid = elems_in_elem_on_rank.host(elems_in_elem_offset + j); // This is a global element ID! 
+ edgeloctab.push_back(static_cast(neighbor_gid)); + ++offset; // Increment running edge count + } + } + + // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure + vertloctab[vertlocnbr] = offset; + + // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally + // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) + const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) + const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size matches number of endpoints + + // Optionally print graph structure for debugging/validation + if (print_info) { + std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr + << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; + std::cout << "vertloctab (CSR row offsets): "; + for (size_t i = 0; i <= vertlocnbr; i++) { + std::cout << vertloctab[i] << " "; + } + std::cout << std::endl; + std::cout << "edgeloctab (first 20 neighbor GIDs): "; + for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { + std::cout << edgeloctab[i] << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + /************************************************************************** + * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild + * + * - PT-Scotch will use our CSR arrays. Since we use compact representation, + * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") + * can be passed as nullptr. + * - edgeloctab contains *GLOBAL element GIDs* of neighbors. PT-Scotch uses this + * to discover connections across processor boundaries, so you do not have to + * encode ownership or partition information yourself. 
+ **************************************************************************/ + int rc = SCOTCH_dgraphBuild( + &dgraph, + baseval, // start index (0) + vertlocnbr, // local vertex count (local elements) + vertlocmax, // local vertex max (no holes) + vertloctab.data(), // row offsets in edgeloctab + /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) + /*veloloctab*/ nullptr, // vertex weights, not used + /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) + edgelocnbr, // local edge endpoints count + edgelocsiz, // size of edge array + edgeloctab.data(), // global neighbor IDs for each local node + /*edgegsttab*/ nullptr, // ghost edge array, not used + /*edloloctab*/ nullptr // edge weights, not used + ); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Optionally, print rank summary after graph build for further validation + if (print_info) { + SCOTCH_Num vertlocnbr_out; + SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); + std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished building the distributed graph using PT-Scotch"<(world_size)); + + // ===================== PT-Scotch Strategy Selection and Documentation ====================== + // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. + // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. + // + // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): + // + // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. + // Useful for quick, generic partitions where quality is not critical. 
+ // + // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). + // For large runs or test runs where speed is more important than minimizing edgecut. + // + // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). + // Slower than the default. Use when high-quality partitioning is desired. + // + // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. + // Use if load balance is more critical than cut size. + // + // Additional Options: + // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). + // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). + // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. + // + // Example usage: + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); + // ^ quality-focused, nparts=number of parts/ranks + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); + // ^ speed-focused, allow 5% imbalance + // + // Reference: + // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf + // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. + // + // --------------- Set up the desired partitioning strategy here: --------------- + SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings + SCOTCH_stratInit(&stratdat); + + // Select partitioning strategy for this run: + // Use SCOTCH_STRATQUALITY for best cut quality. + // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. 
+ // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) + SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.001); + + // partloctab: output array mapping each local element (vertex) to a *target partition number* + // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. + std::vector partloctab(vertlocnbr); + rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Clean up PT-Scotch strategy and architecture objects + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + + // Free the graph now that we have the partition assignments + SCOTCH_dgraphFree(&dgraph); + + /*************************************************************************** + * Step 7 (Optional): Print out the partitioning assignment per element + * - Each local element's local index lid and global ID (gid) are listed with the + * part to which PT-Scotch has assigned them. 
+ ***************************************************************************/ + print_info = false; + for(int rank_id = 0; rank_id < world_size; rank_id++) { + if(rank_id == rank && print_info) { + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { + size_t gid = naive_mesh.local_to_global_elem_mapping.host(lid); + std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid + << " -> part=" << partloctab[lid] << "\n"; + } + MPI_Barrier(MPI_COMM_WORLD); + } + MPI_Barrier(MPI_COMM_WORLD); + } + print_info = false; + +// ****************************************************************************************** +// Build the intermediate mesh (without ghost nodes and elements) from the repartition +// ****************************************************************************************** + + + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "\n=== Starting Mesh Redistribution Phase ===\n"; + MPI_Barrier(MPI_COMM_WORLD); + + // -------------- Phase 1: Determine elements to send to each rank -------------- + std::vector> elems_to_send(world_size); + for (int lid = 0; lid < naive_mesh.num_elems; lid++) { + int dest = static_cast(partloctab[lid]); + int elem_gid = static_cast(naive_mesh.local_to_global_elem_mapping.host(lid)); + elems_to_send[dest].push_back(elem_gid); + } + + // -------------- Phase 2: Exchange element GIDs -------------- + std::vector sendcounts(world_size), recvcounts(world_size); + for (int r = 0; r < world_size; r++) + sendcounts[r] = static_cast(elems_to_send[r].size()); + + MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements + std::vector sdispls(world_size), rdispls(world_size); + int send_total = 0, recv_total = 0; + for (int r = 0; r < world_size; r++) { + sdispls[r] = send_total; + rdispls[r] = recv_total; + send_total += sendcounts[r]; + recv_total += recvcounts[r]; + } + + + // Flatten send buffer + // 
send_elems: flattened list of element global IDs (GIDs) that this rank is sending to all other ranks. + // For each rank r, elems_to_send[r] contains the element GIDs that should be owned by rank r after repartitioning. + std::vector send_elems; + send_elems.reserve(send_total); + for (int r = 0; r < world_size; r++) + send_elems.insert(send_elems.end(), elems_to_send[r].begin(), elems_to_send[r].end()); + + // new_elem_gids: receives the list of new element global IDs this rank will own after the exchange. + // It is filled after MPI_Alltoallv completes, and contains the GIDs for the elements new to (or remained on) this rank. + std::vector new_elem_gids(recv_total); + MPI_Alltoallv(send_elems.data(), sendcounts.data(), sdispls.data(), MPI_INT, + new_elem_gids.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // New elements owned by this rank + int num_new_elems = static_cast(new_elem_gids.size()); + + // -------------- Phase 3: Send element–node connectivity -------------- + int nodes_per_elem = naive_mesh.num_nodes_in_elem; + + // Flatten element-node connectivity by global node IDs + std::vector conn_sendbuf; + for (int r = 0; r < world_size; r++) { + for (int elem_gid : elems_to_send[r]) { + // find local element lid from elem_gid + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; i++) + if (naive_mesh.local_to_global_elem_mapping.host(i) == elem_gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + conn_sendbuf.push_back(node_gid); + } + } + } + + // element-node connectivity counts (ints per dest rank) + std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); + for (int r = 0; r < world_size; r++) + conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; + + MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, 
MPI_INT, MPI_COMM_WORLD); + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< conn_sdispls(world_size), conn_rdispls(world_size); + int conn_send_total = 0, conn_recv_total = 0; + for (int r = 0; r < world_size; r++) { + conn_sdispls[r] = conn_send_total; + conn_rdispls[r] = conn_recv_total; + conn_send_total += conn_sendcounts[r]; + conn_recv_total += conn_recvcounts[r]; + } + + std::vector conn_recvbuf(conn_recv_total); + MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, + conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); + std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); + int num_new_nodes = static_cast(new_node_gids.size()); + + // Build map gid→lid + std::unordered_map node_gid_to_lid; + for (int i = 0; i < num_new_nodes; i++) + node_gid_to_lid[new_node_gids[i]] = i; + + if (print_info) + std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; + + + // -------------- Phase 5: Request node coordinates -------------- + std::vector node_coords_sendbuf; + for (int r = 0; r < world_size; r++) { + for (int gid : elems_to_send[r]) { + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; i++) + if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + + for(int dim = 0; dim < num_dim; dim++) { + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, dim)); + } + } + } + } + + // Each node is 3 doubles; same sendcounts scaling applies + std::vector coord_sendcounts(world_size), 
coord_recvcounts(world_size); + for (int r = 0; r < world_size; r++) + coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; + + MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates counts"< coord_sdispls(world_size), coord_rdispls(world_size); + int coord_send_total = 0, coord_recv_total = 0; + for (int r = 0; r < world_size; r++) { + coord_sdispls[r] = coord_send_total; + coord_rdispls[r] = coord_recv_total; + coord_send_total += coord_sendcounts[r]; + coord_recv_total += coord_recvcounts[r]; + } + + std::vector coord_recvbuf(coord_recv_total); + MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, + coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates"<(num_new_nodes, "intermediate_mesh.local_to_global_node_mapping"); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems, "intermediate_mesh.local_to_global_elem_mapping"); + + // Fill global mappings + for (int i = 0; i < num_new_nodes; i++) + intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; + for (int i = 0; i < num_new_elems; i++) + intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; + + intermediate_mesh.local_to_global_node_mapping.update_device(); + intermediate_mesh.local_to_global_elem_mapping.update_device(); + + // rebuild the local element-node connectivity using the local node ids + for(int i = 0; i < intermediate_mesh.num_elems; i++) { + for(int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { + int node_gid = conn_recvbuf[i * intermediate_mesh.num_nodes_in_elem + j]; + + int node_lid = -1; + + // Binary search through local_to_global_node_mapping to find the equivalent local index + 
int left = 0, right = num_new_nodes - 1; + while (left <= right) { + int mid = left + (right - left) / 2; + size_t mid_gid = intermediate_mesh.local_to_global_node_mapping.host(mid); + if (node_gid == mid_gid) { + node_lid = mid; + break; + } else if (node_gid < mid_gid) { + right = mid - 1; + } else { + left = mid + 1; + } + } + intermediate_mesh.nodes_in_elem.host(i, j) = node_lid; + } + } + + intermediate_mesh.nodes_in_elem.update_device(); + + // Fill node coordinates + // coord_recvbuf contains coords in element-node order, but we need them in node order + // Build a map from node GID to coordinates + std::map> node_gid_to_coords; + int coord_idx = 0; + for (int e = 0; e < intermediate_mesh.num_elems; e++) { + for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { + int node_gid = conn_recvbuf[e * intermediate_mesh.num_nodes_in_elem + j]; + if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { + std::vector coords(num_dim); + for (int d = 0; d < num_dim; d++) { + coords[d] = coord_recvbuf[coord_idx * num_dim + d]; + } + node_gid_to_coords[node_gid] = coords; + } + coord_idx++; + } + } + + // Now fill coordinates in node order + intermediate_node.initialize(num_new_nodes, num_dim, {node_state::coords}); + for (int i = 0; i < num_new_nodes; i++) { + int node_gid = new_node_gids[i]; + auto it = node_gid_to_coords.find(node_gid); + if (it != node_gid_to_coords.end()) { + for (int d = 0; d < num_dim; d++) { + intermediate_node.coords.host(i, d) = it->second[d]; + } + } + } + intermediate_node.coords.update_device(); + + // Connectivity rebuild + intermediate_mesh.build_connectivity(); + MPI_Barrier(MPI_COMM_WORLD); + + CommunicationPlan element_communication_plan; + element_communication_plan.initialize(MPI_COMM_WORLD); + + CommunicationPlan node_communication_plan; + node_communication_plan.initialize(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting the ghost element and node construction"< 
gauss_pt_states = {gauss_pt_state::fields, gauss_pt_state::fields_vec}; + + gauss_point.initialize(final_mesh.num_elems, final_mesh.num_dims, gauss_pt_states, element_communication_plan); // , &element_communication_plan + + // Initialize the gauss point fields on each rank + // Set owned elements to rank number, ghost elements to -1 (to verify communication) + for (int i = 0; i < final_mesh.num_owned_elems; i++) { + gauss_point.fields.host(i) = static_cast(rank); + gauss_point.fields_vec.host(i, 0) = static_cast(rank); + gauss_point.fields_vec.host(i, 1) = static_cast(rank); + gauss_point.fields_vec.host(i, 2) = static_cast(rank); + } + for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { + gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated + gauss_point.fields_vec.host(i, 0) = -100.0; + gauss_point.fields_vec.host(i, 1) = -100.0; + gauss_point.fields_vec.host(i, 2) = -100.0; + } + gauss_point.fields.update_device(); + gauss_point.fields_vec.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); + + gauss_point.fields.communicate(); + gauss_point.fields_vec.communicate(); + + MPI_Barrier(MPI_COMM_WORLD); + + CArrayKokkos tmp(final_mesh.num_elems); + + // Loop over all elements and average the values of elements connected to that element + FOR_ALL(i, 0, final_mesh.num_elems, { + double value = 0.0; + for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { + value += gauss_point.fields(final_mesh.elems_in_elem(i, j)); + } + value /= final_mesh.num_elems_in_elem(i); + + tmp(i) = value; + + + value = 0.0; + for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { + value += gauss_point.fields_vec(final_mesh.elems_in_elem(i, j), 0); + } + value /= final_mesh.num_elems_in_elem(i); + + gauss_point.fields_vec(i, 0) = value; + gauss_point.fields_vec(i, 1) = value; + gauss_point.fields_vec(i, 2) = value; + }); + MATAR_FENCE(); + + FOR_ALL(i, 0, final_mesh.num_elems, { + gauss_point.fields(i) = tmp(i); + }); + MATAR_FENCE(); + + 
gauss_point.fields.update_host(); + gauss_point.fields_vec.update_host(); + + + + // Test node communication using MPI_Neighbor_alltoallv + std::vector node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; + final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); + + for (int i = 0; i < final_mesh.num_owned_nodes; i++) { + final_node.scalar_field.host(i) = static_cast(rank); + for(int dim = 0; dim < num_dim; dim++){ + final_node.vector_field.host(i, dim) = static_cast(rank); + } + } + for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { + final_node.scalar_field.host(i) = -100.0; + for(int dim = 0; dim < num_dim; dim++){ + final_node.vector_field.host(i, dim) = -100; + } + } + + final_node.coords.update_device(); + final_node.scalar_field.update_device(); + final_node.vector_field.update_device(); + MATAR_FENCE(); + MPI_Barrier(MPI_COMM_WORLD); + + node_communication_plan.verify_graph_communicator(); + + final_node.scalar_field.communicate(); + final_node.vector_field.communicate(); + + MATAR_FENCE(); + MPI_Barrier(MPI_COMM_WORLD); + + DCArrayKokkos tmp_too(final_mesh.num_nodes); + for(int smooth = 0; smooth < 3; smooth++){ + FOR_ALL(i, 0, final_mesh.num_nodes, { + + double value = final_node.scalar_field(i); + for(int j = 0; j < final_mesh.num_nodes_in_node(i); j++){ + value += final_node.scalar_field(final_mesh.nodes_in_node(i, j)); + } + value /= final_mesh.num_nodes_in_node(i) + 1; + tmp_too(i) = value; + }); + MATAR_FENCE(); + + FOR_ALL(i, 0, final_mesh.num_nodes, { + final_node.scalar_field(i) = tmp_too(i); + for(int dim = 0; dim < num_dim; dim++){ + final_node.vector_field(i, dim) = tmp_too(i); + } + }); + MATAR_FENCE(); + } + + final_node.scalar_field.update_host(); + + MATAR_FENCE(); + MPI_Barrier(MPI_COMM_WORLD); +} + +#endif // DECOMP_UTILS_H \ No newline at end of file diff --git a/examples/mesh_decomp/install_ptscotch.sh b/examples/mesh_decomp/install_ptscotch.sh new 
file mode 100755 index 00000000..29d3f853 --- /dev/null +++ b/examples/mesh_decomp/install_ptscotch.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Install script for Scotch and PT-Scotch +set -e + +# Configuration +LIB_DIR="lib" +# SCOTCH_VERSION="7.0.4" +# PTSCOTCH_VERSION="7.0.4" +# INSTALL_PREFIX="$(pwd)/${LIB_DIR}" + +# echo "Installing Scotch and PT-Scotch to ${INSTALL_PREFIX}" + +# Create lib directory if it doesn't exist +if [ ! -d "${LIB_DIR}" ]; then + mkdir -p "${LIB_DIR}" +fi +cd ${LIB_DIR} +# Clone and build Scotch +echo "Cloning Scotch..." +if [ -d "scotch" ]; then + rm -rf scotch +fi +git clone https://gitlab.inria.fr/scotch/scotch.git +cd scotch + +echo "Building Scotch..." +mkdir build +cd build +cmake .. -DCMAKE_BUILD_TYPE=Release \ + -DSCOTCH_MPI=ON \ + -DMPI_C_COMPILER=mpicc \ + -DMPI_Fortran_COMPILER=mpifort +make + +echo "Installation complete! Libraries installed in: ${INSTALL_PREFIX}" \ No newline at end of file diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h new file mode 100644 index 00000000..01ad00c6 --- /dev/null +++ b/examples/mesh_decomp/mesh.h @@ -0,0 +1,1502 @@ +/********************************************************************************************** +� 2020. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos +National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All rights in the program are +reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear +Security Administration. The Government is granted for itself and others acting on its behalf a +nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare +derivative works, distribute copies to the public, perform publicly and display publicly, and +to permit others to do so. 
+This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used +to endorse or promote products derived from this software without specific prior +written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************************/ +#ifndef MESH_H +#define MESH_H + +#include "matar.h" +#include "state.h" +#include + +#define PI 3.141592653589793 + +using namespace mtr; + +namespace mesh_init +{ +// element mesh types +enum elem_name_tag +{ + linear_simplex_element = 0, + linear_tensor_element = 1, + arbitrary_tensor_element = 2 +}; + +// other enums could go here on the mesh +} // end namespace + + +/* +========================== +Nodal indexing convention +========================== + + K + ^ J + | / + | / + | / + 6------------------7 + /| /| + / | / | + / | / | + / | / | + / | / | +4------------------5 | +| | | | ----> I +| | | | +| | | | +| | | | +| 2------------|-----3 +| / | / +| / | / +| / | / +| / | / +|/ |/ +0------------------1 + +nodes are ordered for outward normal +patch 0: [0,4,6,2] xi-minus dir +patch 1: [1,3,7,5] xi-plus dir +patch 2: [0,1,5,4] eta-minus dir +patch 3: [3,2,6,7] eta-plus dir +patch 4: [0,2,3,1] zeta-minus dir +patch 6: [4,5,7,6] zeta-plus dir +*/ + +// sort in ascending order using bubble sort +KOKKOS_INLINE_FUNCTION +void bubble_sort(size_t arr[], const size_t num) +{ + for (size_t i = 0; i < (num - 1); i++) { + for (size_t j = 0; j < (num - i - 1); j++) { + if (arr[j] > arr[j + 1]) { + size_t temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } // end if + } // end for j + } // end for i +} // end function + +struct zones_in_elem_t +{ + private: + size_t num_zones_in_elem_; + public: + zones_in_elem_t() { + }; + + zones_in_elem_t(const size_t num_zones_in_elem_inp) { + this->num_zones_in_elem_ = num_zones_in_elem_inp; + }; + + // return global zone index for given local zone index in an element + size_t host(const size_t elem_gid, const size_t zone_lid) const + { + return elem_gid * num_zones_in_elem_ + zone_lid; + }; + + // Return the global zone ID given an element gloabl ID and a local zone ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t 
elem_gid, const size_t zone_lid) const + { + return elem_gid * num_zones_in_elem_ + zone_lid; + }; +}; + +// if material points are defined strictly internal to the element. +struct gauss_in_elem_t +{ + private: + size_t num_gauss_in_elem_; + public: + gauss_in_elem_t() { + }; + + gauss_in_elem_t(const size_t num_gauss_in_elem_inp) { + this->num_gauss_in_elem_ = num_gauss_in_elem_inp; + }; + + // return global gauss index for given local gauss index in an element + size_t host(const size_t elem_gid, const size_t leg_gauss_lid) const + { + return elem_gid * num_gauss_in_elem_ + leg_gauss_lid; + }; + + // Return the global gauss ID given an element gloabl ID and a local gauss ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t elem_gid, const size_t leg_gauss_lid) const + { + return elem_gid * num_gauss_in_elem_ + leg_gauss_lid; + }; +}; + +/// if material points are defined at element interfaces +struct lobatto_in_elem_t +{ + private: + size_t num_lobatto_in_elem_; + public: + lobatto_in_elem_t() { + }; + + lobatto_in_elem_t(const size_t num_lobatto_in_elem_inp) { + this->num_lobatto_in_elem_ = num_lobatto_in_elem_inp; + }; + + // return global gauss index for given local gauss index in an element + size_t host(const size_t elem_gid, const size_t lob_gauss_lid) const + { + return elem_gid * num_lobatto_in_elem_ + lob_gauss_lid; + }; + + // Return the global gauss ID given an element gloabl ID and a local gauss ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t elem_gid, const size_t lob_gauss_lid) const + { + return elem_gid * num_lobatto_in_elem_ + lob_gauss_lid; + }; +}; + +// struct nodes_in_zone_t { +// private: +// size_t num_nodes_in_zone_; +// public: +// nodes_in_zone_t(){}; + +// nodes_in_zone_t(const size_t num_nodes_in_zone_inp){ +// this->num_nodes_in_zone_ = num_nodes_in_zone_inp; +// }; + +// // return global zone index for given local zone index in an element +// size_t host(const size_t zone_gid, const size_t node_lid) const{ +// 
return zone_gid*num_nodes_in_zone_ + node_lid; +// }; + +// KOKKOS_INLINE_FUNCTION +// size_t operator()(const size_t zone_gid, const size_t node_lid) const{ +// return zone_gid*num_nodes_in_zone_ + node_lid; +// }; +// }; + +// mesh sizes and connectivity data structures +struct Mesh_t +{ + // ******* Entity Definitions **********// + // Element: A hexahedral volume + // Zone: A discretization of an element base on subdividing the element using the nodes + // Node: A kinematic degree of freedom + // Surface: The 2D surface of the element + // Patch: A discretization of a surface by subdividing the surface using the nodes + // Corner: A element-node pair + + bool verbose = false; + + // ---- Global Mesh Definitions ---- // + mesh_init::elem_name_tag elem_kind = mesh_init::linear_tensor_element; ///< The type of elements used in the mesh + + size_t Pn = 1; ///< Polynomial order of kinematic space + size_t num_dims = 3; ///< Number of spatial dimension + + // ---- Element Data Definitions ---- // + size_t num_elems; ///< Number of elements in the mesh + size_t num_nodes_in_elem; ///< Number of nodes in an element + size_t num_patches_in_elem; ///< Number of patches in an element + size_t num_surfs_in_elem; ///< Number of surfaces in an element + size_t num_zones_in_elem; ///< Number of zones in an element + + size_t num_gauss_in_elem; ///< Number of Gauss points in an element + size_t num_lobatto_in_elem; ///< Number of Gauss Lobatto points in an element + + DCArrayKokkos nodes_in_elem; ///< Nodes in an element + CArrayKokkos corners_in_elem; ///< Corners in an element -- this can just be a functor + + RaggedRightArrayKokkos elems_in_elem; ///< Elements connected to an element + CArrayKokkos num_elems_in_elem; ///< Number of elements connected to an element + + CArrayKokkos patches_in_elem; ///< Patches in an element (including internal patches) + CArrayKokkos surfs_in_elem; ///< Surfaces on an element + + // CArrayKokkos zones_in_elem; ///< Zones in an element + 
zones_in_elem_t zones_in_elem; ///< Zones in an element + lobatto_in_elem_t lobatto_in_elem; ///< Gauss Lobatto points in an element + gauss_in_elem_t gauss_in_elem; ///< Gauss points in an element + + // ---- Node Data Definitions ---- // + size_t num_nodes; ///< Number of nodes in the mesh + + RaggedRightArrayKokkos corners_in_node; ///< Corners connected to a node + CArrayKokkos num_corners_in_node; ///< Number of corners connected to a node + RaggedRightArrayKokkos elems_in_node; ///< Elements connected to a given node + RaggedRightArrayKokkos nodes_in_node; ///< Nodes connected to a node along an edge + CArrayKokkos num_nodes_in_node; ///< Number of nodes connected to a node along an edge + + // ---- Surface Data Definitions ---- // + size_t num_surfs; ///< Number of surfaces in the mesh + size_t num_nodes_in_surf; ///< Number of nodes in a surface + size_t num_patches_in_surf; ///< Number of patches in a surface + + CArrayKokkos patches_in_surf; ///< Patches in a surface + CArrayKokkos nodes_in_surf; ///< Nodes connected to a surface + CArrayKokkos elems_in_surf; ///< Elements connected to a surface + + // ---- Patch Data Definitions ---- // + size_t num_patches; ///< Number of patches in the mesh + size_t num_nodes_in_patch; ///< Number of nodes in a patch + // size_t num_lobatto_in_patch; ///< Number of Gauss Lobatto nodes in a patch + // size_t num_gauss_in_patch; ///< Number of Gauss nodes in a patch + + CArrayKokkos nodes_in_patch; ///< Nodes connected to a patch + CArrayKokkos elems_in_patch; ///< Elements connected to a patch + CArrayKokkos surf_in_patch; ///< Surfaces connected to a patch (co-planar) + + // ---- Corner Data Definitions ---- // + size_t num_corners; ///< Number of corners (define) in the mesh + + // ---- Zone Data Definitions ---- // + size_t num_zones; ///< Number of zones in the mesh + size_t num_nodes_in_zone; ///< Number of nodes in a zone + + CArrayKokkos nodes_in_zone; ///< Nodes defining a zone + // nodes_in_zone_t 
nodes_in_zone; + + // ---- Boundary Data Definitions ---- // + size_t num_bdy_sets; ///< Number of boundary sets + size_t num_bdy_nodes; ///< Number of boundary nodes + size_t num_bdy_patches; ///< Number of boundary patches + + CArrayKokkos bdy_patches; ///< Boundary patches + CArrayKokkos bdy_nodes; ///< Boundary nodes + + RaggedRightArrayKokkos bdy_patches_in_set; ///< Boundary patches in a boundary set + DCArrayKokkos num_bdy_patches_in_set; ///< Number of boundary nodes in a set + + RaggedRightArrayKokkos bdy_nodes_in_set; ///< Boundary nodes in a boundary set + DCArrayKokkos num_bdy_nodes_in_set; ///< Number of boundary nodes in a set + + + // MPI Decomposition Data Definitions ---- // + DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping + DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping + + // Element communicaiton data definitions + size_t num_owned_elems; ///< Number of owned elements on this rank + size_t num_boundary_elems; ///< Number of boundary elements on this rank (send data to neighboring MPI ranks) + DCArrayKokkos boundary_elem_local_ids; ///< Local IDs of boundary elements on this rank (send data to neighboring MPI ranks) + size_t num_ghost_elems; ///< Number of ghost elements on this rank (receive data from neighboring MPI ranks) + + // Node communicaiton data definitions + size_t num_owned_nodes; ///< Number of owned nodes on this rank + size_t num_boundary_nodes; ///< Number of boundary nodes on this rank (send data to neighboring MPI ranks) + DCArrayKokkos boundary_node_local_ids; ///< Local IDs of boundary nodes on this rank (send data to neighboring MPI ranks) + size_t num_ghost_nodes; ///< Number of ghost nodes on this rank (receive data from neighboring MPI ranks) + + + + + + // initialization methods + void initialize_nodes(const size_t num_nodes_inp) + { + num_nodes = num_nodes_inp; + return; + }; // end method + + // initialization methods + void initialize_elems(const size_t 
num_elems_inp, const size_t num_dims_inp) + { + num_dims = num_dims_inp; + num_nodes_in_elem = 1; + + for (int dim = 0; dim < num_dims; dim++) { + num_nodes_in_elem *= 2; + } + num_elems = num_elems_inp; + nodes_in_elem = DCArrayKokkos(num_elems, num_nodes_in_elem, "mesh.nodes_in_elem"); + corners_in_elem = CArrayKokkos(num_elems, num_nodes_in_elem, "mesh.corners_in_elem"); + + // 1 Gauss point per element + num_gauss_in_elem = 1; + + // 1 zone per element + num_zones_in_elem = 1; + + gauss_in_elem = gauss_in_elem_t(num_gauss_in_elem); + + return; + }; // end method + + // initialization method + void initialize_elems_Pn(const size_t num_elems_inp, + const size_t num_nodes_in_elem_inp, + const size_t num_gauss_leg_in_elem_inp, + const size_t num_zones_in_elem_inp, + const size_t num_nodes_in_zone_inp, + const size_t num_surfs_in_elem_inp, + const size_t num_dims_inp) + { + num_dims = num_dims_inp; + num_elems = num_elems_inp; + + num_nodes_in_elem = num_nodes_in_elem_inp; + num_nodes_in_zone = num_nodes_in_zone_inp; + num_gauss_in_elem = num_gauss_leg_in_elem_inp; + num_zones_in_elem = num_zones_in_elem_inp; + num_surfs_in_elem = num_surfs_in_elem_inp; + + num_zones = num_zones_in_elem * num_elems; + + nodes_in_elem = DCArrayKokkos(num_elems, num_nodes_in_elem, "mesh.nodes_in_elem"); + corners_in_elem = CArrayKokkos(num_elems, num_nodes_in_elem, "mesh.corners_in_elem"); + zones_in_elem = zones_in_elem_t(num_zones_in_elem); + surfs_in_elem = CArrayKokkos(num_elems, num_surfs_in_elem, "mesh.surfs_in_zone"); + nodes_in_zone = CArrayKokkos(num_zones, num_nodes_in_zone, "mesh.nodes_in_zone"); + gauss_in_elem = gauss_in_elem_t(num_gauss_in_elem); + + return; + }; // end method + + // initialization methods + void initialize_corners(const size_t num_corners_inp) + { + num_corners = num_corners_inp; + + return; + }; // end method + + // build the corner mesh connectivity arrays + void build_corner_connectivity() + { + num_corners_in_node = CArrayKokkos(num_nodes, 
"mesh.num_corners_in_node"); // stride sizes + + // initializing the number of corners (node-cell pair) to be zero + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + num_corners_in_node(node_gid) = 0; + }); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(node_lid, 0, num_nodes_in_elem, { + // get the global_id of the node + size_t node_gid = nodes_in_elem(elem_gid, node_lid); + + // increment the number of corners attached to this point + num_corners_in_node(node_gid) = num_corners_in_node(node_gid) + 1; + }); // end FOR_ALL over nodes in element + } // end for elem_gid + + // the stride sizes are the num_corners_in_node at the node + corners_in_node = RaggedRightArrayKokkos(num_corners_in_node, "mesh.corners_in_node"); + + CArrayKokkos count_saved_corners_in_node(num_nodes, "count_saved_corners_in_node"); + + // reset num_corners to zero + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + count_saved_corners_in_node(node_gid) = 0; + }); + + // the elems_in_elem data type + elems_in_node = RaggedRightArrayKokkos(num_corners_in_node, "mesh.elems_in_node"); + + // populate the elements connected to a node list and corners in a node + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(node_lid, 0, num_nodes_in_elem, { + // get the global_id of the node + size_t node_gid = nodes_in_elem(elem_gid, node_lid); + + // the column index is the num corners saved + size_t j = count_saved_corners_in_node(node_gid); + + // Save corner index to this node_gid + size_t corner_gid = node_lid + elem_gid * num_nodes_in_elem; // this can be a functor + corners_in_node(node_gid, j) = corner_gid; + + elems_in_node(node_gid, j) = elem_gid; // save the elem_gid + + // Save corner index to element + size_t corner_lid = node_lid; + corners_in_elem(elem_gid, corner_lid) = corner_gid; + + // increment the number of corners saved to this node_gid + count_saved_corners_in_node(node_gid) = count_saved_corners_in_node(node_gid) + 1; + }); // end 
FOR_ALL over nodes in element + } // end for elem_gid + + return; + } // end of build_corner_connectivity + + // build elem connectivity arrays + void build_elem_elem_connectivity() + { + // find the max number of elems around a node + size_t max_num_elems_in_node; + size_t max_num_lcl; + FOR_REDUCE_MAX_CLASS(node_gid, 0, num_nodes, max_num_lcl, { + // num_corners_in_node = num_elems_in_node + size_t max_num = num_corners_in_node(node_gid); + + if (max_num > max_num_lcl) { + max_num_lcl = max_num; + } + }, max_num_elems_in_node); // end parallel reduction on max + Kokkos::fence(); + + // a temporary ragged array to save the elems around an elem + DynamicRaggedRightArrayKokkos temp_elems_in_elem(num_nodes, num_nodes_in_elem * max_num_elems_in_node, "temp_elems_in_elem"); + + num_elems_in_elem = CArrayKokkos(num_elems, "mesh.num_elems_in_elem"); + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + num_elems_in_elem(elem_gid) = 0; + }); + Kokkos::fence(); + + // find and save neighboring elem_gids of an elem + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { + // get the gid for the node + size_t node_id = nodes_in_elem(elem_gid, node_lid); + + // loop over all elems connected to node_gid + for (int elem_lid = 0; elem_lid < num_corners_in_node(node_id); elem_lid++) { + // get the global id for the neighboring elem + size_t neighbor_elem_gid = elems_in_node(node_id, elem_lid); + + // a flag to save (=1) or not (=0) + size_t save = 1; + + // a true neighbor_elem_id is not equal to elem_gid + if (neighbor_elem_gid == elem_gid) { + save = 0; // don't save + } // end if + + // check to see if the neighbor_elem_gid has been saved already + size_t num_saved = temp_elems_in_elem.stride(elem_gid); + for (size_t i = 0; i < num_saved; i++) { + if (neighbor_elem_gid == temp_elems_in_elem(elem_gid, i)) { + save = 0; // don't save, it has been saved already + } // end if + } // end for i + + if (save == 1) { + // increment the 
number of neighboring elements saved + temp_elems_in_elem.stride(elem_gid)++; + + // save the neighboring elem_gid + temp_elems_in_elem(elem_gid, num_saved) = neighbor_elem_gid; + } // end if save + } // end for elem_lid in a node + } // end for node_lid in an elem + + // save the actial stride size + num_elems_in_elem(elem_gid) = temp_elems_in_elem.stride(elem_gid); + }); // end FOR_ALL elems + Kokkos::fence(); + + // compress out the extra space in the temp_elems_in_elem + elems_in_elem = RaggedRightArrayKokkos(num_elems_in_elem, "mesh.elems_in_elem"); + + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t i = 0; i < num_elems_in_elem(elem_gid); i++) { + elems_in_elem(elem_gid, i) = temp_elems_in_elem(elem_gid, i); + } // end for i + }); // end FOR_ALL elems + Kokkos::fence(); + + return; + } // end of build_elem_elem_connectivity + + // build the patches + void build_patch_connectivity() + { + // WARNING WARNING + // the mesh element kind should be in the input file and set when reading mesh + // mesh_elem_kind = mesh_init::linear_tensor_element; // MUST BE SET + + // building patches + + num_nodes_in_patch = 2 * (num_dims - 1); // 2 (2D) or 4 (3D) + num_surfs_in_elem = 2 * num_dims; // 4 (2D) or 6 (3D) + + // num_lobatto_in_patch = int(pow(3, num_dims-1)); + + // num_gauss_in_patch = 2*(num_dims-1); + + size_t num_patches_in_surf; // = Pn_order or = Pn_order*Pn_order + + size_t num_1D = Pn + 1; // number of nodes in 1D + + // num quad points 1D // + // size_t num_lob_1D = 2*Pn + 1; + // size_t num_1D = 2*Pn; + + DCArrayKokkos node_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_nodes_in_patch); + + // DCArrayKokkos lobatto_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_lobatto_in_patch); + + // DCArrayKokkos gauss_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_gauss_in_patch); + + if (verbose) printf("Number of dimensions = %zu \n", num_dims); + + if (num_dims == 3) { + // num_patches_in_surf = [1^2, 
2^2, 3^2, 4^2, ... , Pn^2] + + num_patches_in_surf = Pn * Pn; + + num_patches_in_elem = num_patches_in_surf * num_surfs_in_elem; + + // nodes in a patch in the element + node_ordering_in_elem = DCArrayKokkos(num_patches_in_elem, num_nodes_in_patch, "node_ordering_in_elem"); + + // lobatto_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_lobatto_in_patch); + + // gauss_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_gauss_in_patch); + + // printf("num_patches_in_elem = %zu \n", num_patches_in_elem); + // printf("num_nodes_in_patch = %zu \n", num_nodes_in_patch); + // printf("num_lobatto_in_patch = %zu \n", num_lobatto_in_patch); + // printf("num_gauss_in_patch = %zu \n", num_gauss_in_patch); + // printf("Number of surfaces = %zu \n", num_surfs_in_elem); + } + else { + num_patches_in_surf = Pn; + + num_patches_in_elem = num_patches_in_surf * num_surfs_in_elem; + + // nodes in a patch in the element + node_ordering_in_elem = DCArrayKokkos(num_patches_in_elem, num_nodes_in_patch, "node_ordering_in_elem"); + // lobatto_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_lobatto_in_patch); + // gauss_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_gauss_in_patch); + } // end if dim + + // On the CPU, set the node order for the patches in an element + // classic linear elements + if (elem_kind == mesh_init::linear_tensor_element) { + if (num_dims == 3) { + + size_t temp_node_lids[24] = { 0, 4, 6, 2, + 1, 3, 7, 5, + 0, 1, 5, 4, + 3, 2, 6, 7, + 0, 2, 3, 1, + 4, 5, 7, 6 }; + + int count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + // gauss_ordering_in_elem.host( elem_patch_lid, node_lid ) = temp_node_lids[count]; + count++; + } // end for node_lid + 
elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + + // count = 0; + // elem_patch_lid = 0; + // for ( size_t surf_lid=0; surf_lid < num_surfs_in_elem; surf_lid++ ){ + // for ( size_t patch_lid=0; patch_lid < num_patches_in_surf; patch_lid++ ){ + // for ( size_t lobatto_lid=0; lobatto_lid < num_lobatto_in_patch; lobatto_lid++ ){ + // lobatto_ordering_in_elem.host( elem_patch_lid, lobatto_lid ) = temp_node_lids[count]; + // count++; + // } // end for node_lid + // elem_patch_lid ++; + // } // end for patch_lid in a surface + // } // end for i + } + else { + // J + // | + // 3---2 + // | | -- I + // 0---1 + // + size_t temp_node_lids[8] = + { 0, 3, + 1, 2, + 0, 1, + 3, 2 }; + + int count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + // gauss_ordering_in_elem.host( elem_patch_lid, node_lid ) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end if on dims + } // end of linear element iwth classic numbering + // ----- + // arbitrary-order element + // ----- + else if (elem_kind == mesh_init::arbitrary_tensor_element) { + size_t temp_node_lids[num_nodes_in_patch * num_patches_in_surf * num_surfs_in_elem]; + + printf("arbitrary order tensor element \n"); + + // arbitrary-order node ordering in patches of an element + if (num_dims == 3) { + /* + + i,j,k layout + + k j + | / + |/ + o-->i + + + i=0,imax + o (j+1,k+1) + /| + (j,k+1) o o (j+1,k) + |/ + (j,k) o + + */ + + int count = 0; + + int i_patch, j_patch, k_patch; + + // i-minus-dir patches + + i_patch = 0; + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + // node_lid 0 in patch + // index 
= i + j*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j, k, num_1D); + count++; + + // node_lid 1 in patch + // index = i + j*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j, k+1, num_1D); + count++; + + // node_lid 2 in patch + // index = i + (j+1)*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j+1, k+1, num_1D); + count++; + + // node_lid 3 in patch + // index = i + (j+1)*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j+1, k, num_1D); + count++; + } // end for k + } // end for j + + // printf("i-minus\n"); + + // i-plus-dir patches + i_patch = num_1D - 1; + // printf("num_1D = %zu \n", num_1D); + // printf("i_patch = %d \n", i_patch); + printf("num_nodes_in_elem %zu \n", num_nodes_in_elem); + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + // node_lid 0 in patch + // index = i + j*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j, k, num_1D); + count++; + + // node_lid 1 in patch + // index = i + (j+1)*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j+1, k, num_1D); + count++; + + // node_lid 2 in patch + // index = i + (j+1)*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j+1, k+1, num_1D); + count++; + + // node_lid 3 in patch + // index = i + j*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j, k+1, num_1D); + count++; + } // end for j + } // end for k + + // printf("i-plus\n"); + + /* + + i,j,k layout + + k j + | / 
+ |/ + o-->i + + + j=0,jmax + + (i,,k+1) o--o (i+1,,k+1) + | | + (i,,k) o--o (i+1,,k) + + */ + + j_patch = 0; + for (int k = 0; k < num_1D - 1; k++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i, j_patch, k, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i+1, j_patch, k, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i+1, j_patch, k+1, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i, j_patch, k+1, num_1D); + count++; + } // end for i + } // end for k + + // printf("j-minus\n"); + + j_patch = num_1D - 1; + for (int k = 0; k < num_1D - 1; k++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i, j_patch, k, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i, j_patch, k+1, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i+1, j_patch, k+1, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i+1, j_patch, k, num_1D); + count++; + } // end for i + } // end for k + + // printf("j-plus\n"); + + /* + + i,j,k layout + + k j + | / + |/ + o-->i + + + k=0,kmax + + (i,j+1) o--o (i+1,j+1) + / / + (i,j) o--o (i+1,j) + + */ + + k_patch = 0; + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j, k_patch, num_1D); + count++; + + 
// node_lid 1 in patch + temp_node_lids[count] = i + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j+1, k_patch, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j+1, k_patch, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + 1 + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j, k_patch, num_1D); + count++; + } // end for i + } // end for j + // printf("k-minus\n"); + + k_patch = num_1D - 1; + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j, k_patch, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + 1 + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j, k_patch, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j+1, k_patch, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j+1, k_patch, num_1D); + count++; + } // end for i + } // end for j + + // printf("k-plus\n"); + + count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < 6; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < 4; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end if 3D + // + else{ + // 2D arbitrary order elements + int count = 0; + int i_patch, j_patch; + + // i-minus-dir patches + + i_patch = 0; + for (int j = 0; j < num_1D - 1; j++) { + temp_node_lids[count] = i_patch + j * num_1D; // node_rid(i_patch, j, num_1D; + count++; + + 
temp_node_lids[count] = i_patch + (j + 1) * num_1D; // node_rid(i_patch, j+1, num_1D; + count++; + } // end for j + + // i-plus-dir patches + i_patch = num_1D - 1; + for (int j = 0; j < num_1D - 1; j++) { + temp_node_lids[count] = i_patch + j * num_1D; // node_rid(i_patch, j, num_1D; + count++; + + temp_node_lids[count] = i_patch + (j + 1) * num_1D; // node_rid(i_patch, j+1, num_1D; + count++; + } // end for j + + j_patch = 0; + for (int i = 0; i < num_1D - 1; i++) { + temp_node_lids[count] = i + j_patch * num_1D; // node_rid(i, j_patch, num_1D); + count++; + + temp_node_lids[count] = i + 1 + j_patch * num_1D; // node_rid(i+1, j_patch, num_1D); + count++; + } // end for i + + j_patch = num_1D - 1; + for (int i = 0; i < num_1D - 1; i++) { + temp_node_lids[count] = i + j_patch * num_1D; // node_rid(i, j_patch, num_1D); + count++; + + temp_node_lids[count] = i + 1 + j_patch * num_1D; // node_rid(i+1, j_patch, num_1D); + count++; + } // end for i + + count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end else on dim + + // build zones in high order element + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + size_t node_lids[8]; // temp storage for local node ids + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + node_lids[0] = i + j * (num_1D) + k * (num_1D) * (num_1D); // i,j,k + node_lids[1] = i + 1 + j * (num_1D) + k * (num_1D) * (num_1D); // i+1, j, k + node_lids[2] = i + (j + 1) * (num_1D) + k * (num_1D) * (num_1D); // i,j+1,k + node_lids[3] = i + 1 + (j + 1) * (num_1D) + k * (num_1D) * (num_1D); // 
i+1, j+1, k + node_lids[4] = i + j * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i, j , k+1 + node_lids[5] = i + 1 + j * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i + 1, j , k+1 + node_lids[6] = i + (j + 1) * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i,j+1,k+1 + node_lids[7] = i + 1 + (j + 1) * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i+1, j+1, k+1 + + size_t zone_lid = i + j * (num_1D - 1) + k * (num_1D - 1) * (num_1D - 1); + size_t zone_gid = zones_in_elem(elem_gid, zone_lid); + + for (int node_lid = 0; node_lid < 8; node_lid++) { + // get global id for the node + size_t node_gid = nodes_in_elem(elem_gid, node_lids[node_lid]); + nodes_in_zone(zone_gid, node_lid) = node_gid; + } + } // i + } // j + } // k + }); // end FOR_ALL elem_gid + } // end if arbitrary-order element + else { + printf("\nERROR: mesh type is not known \n"); + } // end if + + // update the device + node_ordering_in_elem.update_device(); + Kokkos::fence(); + + if (verbose) printf("Built node ordering \n"); + + // for saving the hash keys of the patches and then the neighboring elem_gid + CArrayKokkos hash_keys_in_elem(num_elems, num_patches_in_elem, num_nodes_in_patch, "hash_keys_in_elem"); // always 4 ids in 3D + + // for saving the adjacent patch_lid, which is the slide_lid + // CArrayKokkos neighboring_side_lids (num_elems, num_patches_in_elem); + + // allocate memory for the patches in the elem + patches_in_elem = CArrayKokkos(num_elems, num_patches_in_elem, "mesh.patches_in_elem"); + + // a temporary storage for the patch_gids that are on the mesh boundary + CArrayKokkos temp_bdy_patches(num_elems * num_patches_in_elem, "temp_bdy_patches"); + + // step 1) calculate the hash values for each patch in the element + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + size_t sorted_patch_nodes[4]; // note: cannot be allocated with num_nodes_in_patch + + // first save the patch nodes + for (size_t patch_node_lid = 0; 
patch_node_lid < num_nodes_in_patch; patch_node_lid++) { + // get the local node index of the element for this patch and node in patch + size_t node_lid = node_ordering_in_elem(patch_lid, patch_node_lid); + + // get and save the global index of the node + sorted_patch_nodes[patch_node_lid] = nodes_in_elem(elem_gid, node_lid); + } // end for node_lid + + // sort nodes from smallest to largest + bubble_sort(sorted_patch_nodes, num_nodes_in_patch); + + // save hash_keys in the this elem + for (size_t key_lid = 0; key_lid < num_nodes_in_patch; key_lid++) { + hash_keys_in_elem(elem_gid, patch_lid, key_lid) = sorted_patch_nodes[key_lid]; // 4 node values are keys + } // for + } // end for patch_lid + }); // end FOR_ALL elem_gid + + DCArrayKokkos num_values(2, "num_values"); + + // 8x8x8 mesh + // num_patches = 8*8*9*3 = 1728 + // bdy_patches = 8*8*6 = 384 + // + + // step 2: walk around the elements and save the elem pairs that have the same hash_key + RUN_CLASS({ + // serial execution on GPU + + size_t patch_gid = 0; + size_t bdy_patch_gid = 0; + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + // loop over the patches in this elem + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + size_t exit = 0; + + // negative values mean the patch has not been saved + if (hash_keys_in_elem(elem_gid, patch_lid, 0) >= 0) { + // find the nighboring patch with the same hash_key + + for (size_t neighbor_elem_lid = 0; neighbor_elem_lid < num_elems_in_elem(elem_gid); neighbor_elem_lid++) { + // get the neighboring element global index + size_t neighbor_elem_gid = elems_in_elem(elem_gid, neighbor_elem_lid); + + for (size_t neighbor_patch_lid = 0; neighbor_patch_lid < num_patches_in_elem; neighbor_patch_lid++) { + size_t save_it = 0; + for (size_t key_lid = 0; key_lid < num_nodes_in_patch; key_lid++) { + if (hash_keys_in_elem(neighbor_elem_gid, neighbor_patch_lid, key_lid) == hash_keys_in_elem(elem_gid, patch_lid, key_lid)) { + save_it++; // if 
save_it == num_nodes after this loop, then it is a match + } + } // end key loop + + // this hash is from the nodes on the patch + if (save_it == num_nodes_in_patch) { + // make it negative, because we saved it + hash_keys_in_elem(elem_gid, patch_lid, 0) = -1; + hash_keys_in_elem(neighbor_elem_gid, neighbor_patch_lid, 0) = -1; + + // save the patch_lids for the adjacent sides + // neighboring_side_lids(elem_gid, patch_lid) = neighbor_patch_lid; + // neighboring_side_lids(neighbor_elem_gid, neighbor_patch_lid) = patch_lid; + + // save the patch_gid + patches_in_elem(elem_gid, patch_lid) = patch_gid; + patches_in_elem(neighbor_elem_gid, neighbor_patch_lid) = patch_gid; + + patch_gid++; + + exit = 1; + break; + } // end if + } // end for loop over a neighbors patch set + + if (exit == 1) { + break; + } + } // end for loop over elem neighbors + } // end if hash<0 + } // end for patch_lid + + // loop over the patches in this element again + // remaining positive hash key values are the boundary patches + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + if (hash_keys_in_elem(elem_gid, patch_lid, 0) >= 0) { + hash_keys_in_elem(elem_gid, patch_lid, 0) = -1; // make it negative, because we saved it + + // neighboring_side_lids(elem_gid, patch_lid) = patch_lid; + + patches_in_elem(elem_gid, patch_lid) = patch_gid; + temp_bdy_patches(bdy_patch_gid) = patch_gid; + + patch_gid++; + bdy_patch_gid++; + } // end if + } // end for over patch_lid + } // end for over elem_gid + + // the num_values is because the values passed in are const, so a const pointer is needed + num_values(0) = patch_gid; // num_patches = patch_gid; + num_values(1) = bdy_patch_gid; // num_bdy_patches = bdy_patch_gid; + }); // end RUN + Kokkos::fence(); + + num_values.update_host(); + Kokkos::fence(); + + num_patches = num_values.host(0); + // this lines assumes num_surfs == num_patches, only valid for 1st order elements + num_surfs = num_values.host(0); + num_bdy_patches = 
num_values.host(1); + + // size_t mesh_1D = 60; + // size_t exact_num_patches = (mesh_1D*mesh_1D)*(mesh_1D+1)*3; + // size_t exact_num_bdy_patches = (mesh_1D*mesh_1D)*6; + // printf("num_patches = %lu, exact = %lu \n", num_patches, exact_num_patches); + // printf("num_bdy_patches = %lu exact = %lu \n", num_bdy_patches, exact_num_bdy_patches); + + // printf("Num patches = %lu \n", num_patches); + // printf("Num boundary patches = %lu \n", num_bdy_patches); + + elems_in_patch = CArrayKokkos(num_patches, 2, "mesh.elems_in_patch"); + nodes_in_patch = CArrayKokkos(num_patches, num_nodes_in_patch, "mesh.nodes_in_patch"); + + // a temporary variable to help populate patch structures + CArrayKokkos num_elems_in_patch_saved(num_patches, "num_elems_in_patch_saved"); + + // initialize the number of elems in a patch saved to zero + FOR_ALL_CLASS(patch_gid, 0, num_patches, { + num_elems_in_patch_saved(patch_gid) = 0; + }); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(patch_lid, 0, num_patches_in_elem, { + size_t patch_gid = patches_in_elem(elem_gid, patch_lid); + + size_t num_saved = num_elems_in_patch_saved(patch_gid); + + elems_in_patch(patch_gid, num_saved) = elem_gid; + + // record that an elem_gid was saved + num_elems_in_patch_saved(patch_gid)++; + + // save the nodes on this patch + for (size_t patch_node_lid = 0; patch_node_lid < num_nodes_in_patch; patch_node_lid++) { + // get the local node index of the element for this patch and node in patch + size_t node_lid = node_ordering_in_elem(patch_lid, patch_node_lid); + + // get and save the global index of the node + nodes_in_patch(patch_gid, patch_node_lid) = nodes_in_elem(elem_gid, node_lid); + } // end for node_lid + }); // end FOR_ALL patch_lid + } // end for + + // Surfaces and patches in surface + if (elem_kind == mesh_init::arbitrary_tensor_element) { + // allocate memory for the surfaces in the elem + surfs_in_elem = CArrayKokkos(num_elems, num_surfs_in_elem); + + // allocate 
memory for surface data structures + num_surfs = num_patches / num_patches_in_surf; + + patches_in_surf = CArrayKokkos(num_surfs, num_patches_in_surf, "mesh.patches_in_surf"); + elems_in_surf = CArrayKokkos(num_surfs, 2, "mesh.elems_in_surf"); + surf_in_patch = CArrayKokkos(num_patches, "mesh.surf_in_patch"); + + FOR_ALL_CLASS(surf_gid, 0, num_surfs, { + // loop over the patches in this surface + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + // get patch_gid + size_t patch_gid = patch_lid + surf_gid * num_patches_in_surf; + + // save the patch_gids + patches_in_surf(surf_gid, patch_lid) = patch_gid; + + // save the surface this patch belongs to + surf_in_patch(patch_gid) = surf_gid; + } // end for + + // get first patch in the surface, and populate elem surface structures + size_t this_patch_gid = surf_gid * num_patches_in_surf; + + elems_in_surf(surf_gid, 0) = elems_in_patch(this_patch_gid, 0); // elem_gid0 + elems_in_surf(surf_gid, 1) = elems_in_patch(this_patch_gid, 1); // elem_gid1 + }); // end FOR_ALL over surfaces + + // save surfaces in elem + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + // get the local patch_lid + size_t patch_lid = surf_lid * num_patches_in_surf; + + // get the patch_gids in this element + size_t patch_gid = patches_in_elem(elem_gid, patch_lid); + + // save the surface gid + // Grab the first patch on surf and return surface_gid from surf_in_patch // + surfs_in_elem(elem_gid, surf_lid) = surf_in_patch(patch_gid); + } // end surf_lid + }); + + DViewCArrayKokkos surf_node_ordering_in_elem; + + if (num_dims == 3) { + // num_1D = Pn+1 + int num_surface_nodes = num_surfs_in_elem * pow(num_1D, num_dims - 1); + size_t temp_surf_node_lids[num_surface_nodes]; + // 2D arbitrary order elements + int count = 0; + + for (int i_surf = 0; i_surf < 2; i_surf++) { + for (int k = 0; k < num_1D; k++) { + for (int j = 0; j < num_1D; j++) { + // node_lid 0 in 
patch + // index = i + j*num_1D + k*num_1D*num_1D; + temp_surf_node_lids[count] = i_surf + j * num_1D + k * num_1D * num_1D; + count++; + } // end for k + } // end for j + } + + for (int j_surf = 0; j_surf < 2; j_surf++) { + for (int k = 0; k < num_1D; k++) { + for (int i = 0; i < num_1D; i++) { + // node_lid 0 in patch + temp_surf_node_lids[count] = i + j_surf * num_1D + k * num_1D * num_1D; + count++; + } + } + } + + for (int k_surf = 0; k_surf < 2; k_surf++) { + for (int j = 0; j < num_1D; j++) { + for (int i = 0; i < num_1D; i++) { + // node_lid 0 in patch + temp_surf_node_lids[count] = i + j * num_1D + k_surf * num_1D * num_1D; + count++; + } + } + } + + nodes_in_surf = CArrayKokkos(num_surfs, num_1D * num_1D, "mesh.nodes_in_surf"); + + num_nodes_in_surf = num_1D * num_1D; + surf_node_ordering_in_elem = DViewCArrayKokkos(&temp_surf_node_lids[0], num_surfs_in_elem, num_nodes_in_surf); + surf_node_ordering_in_elem.update_device(); + for (int elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(surf_lid, 0, num_surfs_in_elem, { + int surf_gid = surfs_in_elem(elem_gid, surf_lid); + for (int surf_node_lid = 0; surf_node_lid < num_nodes_in_surf; surf_node_lid++) { + int node_lid = surf_node_ordering_in_elem(surf_lid, surf_node_lid); + int node_gid = nodes_in_elem(elem_gid, node_lid); + nodes_in_surf(surf_gid, surf_node_lid) = node_gid; + } // end loop over surf_node_lid + }); // end loop over FOR_ALL_CLASS + } // end loop over elem_gid + } // end 3D scope + } // end of high-order mesh objects + + // ---------------- + + // allocate memory for boundary patches + bdy_patches = CArrayKokkos(num_bdy_patches, "mesh.bdy_patches"); + + FOR_ALL_CLASS(bdy_patch_gid, 0, num_bdy_patches, { + bdy_patches(bdy_patch_gid) = temp_bdy_patches(bdy_patch_gid); + }); // end FOR_ALL bdy_patch_gid + + // find and store the boundary nodes + CArrayKokkos temp_bdy_nodes(num_nodes, "temp_bdy_nodes"); + CArrayKokkos hash_bdy_nodes(num_nodes, "hash_bdy_nodes"); + + 
FOR_ALL_CLASS(node_gid, 0, num_nodes, { + hash_bdy_nodes(node_gid) = -1; + }); // end for node_gid + + // Parallel loop over boundary patches + DCArrayKokkos num_bdy_nodes_saved(1, "num_bdy_nodes_saved"); + + RUN_CLASS({ + num_bdy_nodes_saved(0) = 0; + for (size_t bdy_patch_gid = 0; bdy_patch_gid < num_bdy_patches; bdy_patch_gid++) { + // get the global index of the patch that is on the boundary + size_t patch_gid = bdy_patches(bdy_patch_gid); + + // tag the boundary nodes + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + size_t node_gid = nodes_in_patch(patch_gid, node_lid); + + if (hash_bdy_nodes(node_gid) < 0) { + hash_bdy_nodes(node_gid) = node_gid; + temp_bdy_nodes(num_bdy_nodes_saved(0)) = node_gid; + + // printf("bdy_node = %lu \n", node_gid); + num_bdy_nodes_saved(0)++; + } // end if + } // end for node_lid + } // end for loop over bdy_patch_gid + }); // end RUN + Kokkos::fence(); + + // copy value to host (CPU) + num_bdy_nodes_saved.update_host(); + Kokkos::fence(); + + // save the number of bdy_nodes to Mesh_t + num_bdy_nodes = num_bdy_nodes_saved.host(0); + + bdy_nodes = CArrayKokkos(num_bdy_nodes, "mesh.bdy_nodes"); + + FOR_ALL_CLASS(node_gid, 0, num_bdy_nodes, { + bdy_nodes(node_gid) = temp_bdy_nodes(node_gid); + }); // end for boundary node_gid + + // printf("Num boundary nodes = %lu \n", num_bdy_nodes); + + return; + } // end patch connectivity method + + // build the patches + void build_node_node_connectivity() + { + // find the max number of elems around a node + size_t max_num_elems_in_node; + size_t max_num_lcl; + FOR_REDUCE_MAX_CLASS(node_gid, 0, num_nodes, max_num_lcl, { + // num_corners_in_node = num_elems_in_node + size_t max_num = num_corners_in_node(node_gid); + + if (max_num > max_num_lcl) { + max_num_lcl = max_num; + } + }, max_num_elems_in_node); // end parallel reduction on max + Kokkos::fence(); + + // each elem corner will contribute 3 edges to the node. 
Those edges will likely be the same + // ones from an adjacent element so it is a safe estimate to multiply by 3 + DynamicRaggedRightArrayKokkos temp_nodes_in_nodes(num_nodes, max_num_elems_in_node * 3, "temp_nodes_in_nodes"); + + num_nodes_in_node = CArrayKokkos(num_nodes, "mesh.num_nodes_in_node"); + + // walk over the patches and save the node node connectivity + RUN_CLASS({ + if (num_dims == 3) { + for (size_t patch_gid = 0; patch_gid < num_patches; patch_gid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + // the first node on the edge + size_t node_gid_0 = nodes_in_patch(patch_gid, node_lid); + + // second node on this edge + size_t node_gid_1; + + if (node_lid == num_nodes_in_patch - 1) { + node_gid_1 = nodes_in_patch(patch_gid, 0); + } + else { + node_gid_1 = nodes_in_patch(patch_gid, node_lid + 1); + } // end if + + size_t num_saved_0 = temp_nodes_in_nodes.stride(node_gid_0); + size_t num_saved_1 = temp_nodes_in_nodes.stride(node_gid_1); + + size_t save_0 = 1; + size_t save_1 = 1; + + // check to see if the node_gid_1 was already saved + for (size_t contents_lid = 0; contents_lid < num_saved_0; contents_lid++) { + if (temp_nodes_in_nodes(node_gid_0, contents_lid) == node_gid_1) { + save_0 = 0; // don't save, it was already saved + } + } + + // check to see if the node_gid_0 was already saved + for (size_t contents_lid = 0; contents_lid < num_saved_1; contents_lid++) { + if (temp_nodes_in_nodes(node_gid_1, contents_lid) == node_gid_0) { + save_1 = 0; // don't save, it was already saved + } + } + + if (save_0 == 1) { + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_0)++; + + // save the second node to the first node + temp_nodes_in_nodes(node_gid_0, num_saved_0) = node_gid_1; + } + + if (save_1 == 1) { + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_1)++; + + // save the first node to the second node + temp_nodes_in_nodes(node_gid_1, num_saved_1) = 
node_gid_0; + } + + // save the strides + num_nodes_in_node(node_gid_0) = temp_nodes_in_nodes.stride(node_gid_0); + num_nodes_in_node(node_gid_1) = temp_nodes_in_nodes.stride(node_gid_1); + } // end for node in patch + } // end for patches + } // end if 3D + else { + for (size_t patch_gid = 0; patch_gid < num_patches; patch_gid++) { + // the first node on the edge + size_t node_gid_0 = nodes_in_patch(patch_gid, 0); + + // second node on this edge + size_t node_gid_1 = nodes_in_patch(patch_gid, 1); + + size_t num_saved_0 = temp_nodes_in_nodes.stride(node_gid_0); + size_t num_saved_1 = temp_nodes_in_nodes.stride(node_gid_1); + + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_0)++; + temp_nodes_in_nodes.stride(node_gid_1)++; + + // save the second node to the first node + temp_nodes_in_nodes(node_gid_0, num_saved_0) = node_gid_1; + + // save the first node to the second node + temp_nodes_in_nodes(node_gid_1, num_saved_1) = node_gid_0; + + // save the strides + num_nodes_in_node(node_gid_0) = temp_nodes_in_nodes.stride(node_gid_0); + num_nodes_in_node(node_gid_1) = temp_nodes_in_nodes.stride(node_gid_1); + } // end for patches + } // end if 2D + }); // end RUN + Kokkos::fence(); + + nodes_in_node = RaggedRightArrayKokkos(num_nodes_in_node, "mesh.nodes_in_node"); + + // save the connectivity + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + size_t num_saved = 0; + for (size_t node_lid = 0; node_lid < num_nodes_in_node(node_gid); node_lid++) { + nodes_in_node(node_gid, num_saved) = temp_nodes_in_nodes(node_gid, num_saved); + + // increment the number of nodes in node saved + num_saved++; + } // end for node_lid + }); // end parallel for over nodes + } // end of node node connectivity + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_connectivity + /// + /// \brief Calls multiple build connectivity function + /// + 
///////////////////////////////////////////////////////////////////////////// + void build_connectivity() + { + build_corner_connectivity(); + if (verbose) printf("Built corner connectivity \n"); + + build_elem_elem_connectivity(); + if (verbose) printf("Built element-element connectivity \n"); + + build_patch_connectivity(); + if (verbose) printf("Built patch connectivity \n"); + + build_node_node_connectivity(); + if (verbose) printf("Built node-node connectivity \n"); + } + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn init_bdy_sets + /// + /// \brief Initialize memory for boundary sets + /// + ///////////////////////////////////////////////////////////////////////////// + void init_bdy_sets(size_t num_bcs) + { + // if (num_bcs == 0) { + // printf("ERROR: number of boundary sets = 0, set it = 1"); + // num_bcs = 1; + // } + num_bdy_sets = num_bcs; + num_bdy_patches_in_set = DCArrayKokkos(num_bcs, "mesh.num_bdy_patches_in_set"); + + // bdy_patches_in_set is a raggedRight array, it is allocated + // in tag_bdys fcn after the sparsity is known, see geometry_new.cpp + + return; + } // end of init_bdy_sets method + + +}; // end Mesh_t + +#endif \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp new file mode 100644 index 00000000..c9e143f5 --- /dev/null +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -0,0 +1,101 @@ +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// #include + + +// #include "mesh.h" +// #include "state.h" +// #include "mesh_io.h" + +#include "decomp_utils.h" + +// Include Scotch headers +#include "scotch.h" +#include "ptscotch.h" + +int main(int argc, char** argv) { + + MPI_Init(&argc, &argv); + MATAR_INITIALIZE(argc, argv); + { // MATAR scope + + int world_size; + int rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + double t_main_start = 
MPI_Wtime(); + + // Mesh size + double origin[3] = {0.0, 0.0, 0.0}; + double length[3] = {1.0, 1.0, 1.0}; + int num_elems_dim[3] = {30, 30, 30}; + + // Initial mesh built on rank zero + Mesh_t initial_mesh; + node_t initial_node; + + // Mesh partitioned by pt-scotch, including ghost + Mesh_t final_mesh; + node_t final_node; + + GaussPoint_t gauss_point; + +// ******************************************************** +// Build the initial mesh +// ******************************************************** + + double t_init_mesh_start = MPI_Wtime(); + if (rank == 0) { + std::cout<<"World size: "< +#include "matar.h" + +namespace mesh_input +{ +// source of the mesh +enum source +{ + none = 0, ///< No source given, should fail + generate = 1, ///< Create the mesh using the mesh builder + file = 2, ///< Read in the mesh from a file +}; + +// type of mesh to generate if source = generate +enum type +{ + Box = 0, // Create the mesh using the mesh builder + Polar = 1, // Create a polar 2D mesh +}; +} // end of namespace + +static std::map mesh_input_source_map +{ + { "generate", mesh_input::generate }, + { "file", mesh_input::file } +}; + +static std::map mesh_input_type_map +{ + { "box", mesh_input::Box }, + { "polar", mesh_input::Polar } +}; + +///////////////////////////////////////////////////////////////////////////// +/// +/// \struct mesh_input_t +/// +/// \brief Meshing related input parameters +/// +///////////////////////////////////////////////////////////////////////////// +struct mesh_input_t +{ + int num_dims = 3; ///< Number of dimensions for the mesh + mesh_input::source source = mesh_input::none; ///< Source of mesh, file or generate + std::string file_path = ""; ///< Absolute path of mesh file + mesh_input::type type; ///< Type of mesh to generate if + + double origin[3] = { 0.0, 0.0, 0.0 }; ///< Mesh origin for generating a mesh + double length[3] = { 0.0, 0.0, 0.0 }; ///< x,y,z length of generated mesh + size_t num_elems[3] = { 1, 1, 1 }; ///< Number of 
elements along x,y, z for generating a mesh. + + size_t p_order = 1; + + // WARNING, NOT YET PARSED + double inner_radius = 0.0; ///< Inner radius for generating 2D RZ mesh + double outer_radius = 1.0; ///< Outer radius for generating 2D RZ mesh + double starting_angle = 0.0; ///< Starting angle in degrees for 2D RZ mesh + double ending_angle = 90; ///< Ending angle in degrees for 2D RZ mesh + + int num_radial_elems = 10; ///< Number of elements in the radial direction for 2DRZ mesh + int num_angular_elems = 10; ///< Number of elements in the radial direction for 2DRZ mesh + + double scale_x = 1.0; ///< Scales mesh x coordinate dimensions + double scale_y = 1.0; ///< Scales mesh y coordinate dimensions + double scale_z = 1.0; ///< Scales mesh z coordinate dimensions + + DCArrayKokkos object_ids; ///< the object_ids in the vtu full mesh file (from exodus mesh) + +}; // mesh_input_t + +// ---------------------------------- +// valid inputs for mesh options +// ---------------------------------- +static std::vector str_mesh_inps +{ + "num_dims", + "source", + "file_path", + "type", + "origin", + "length", + "num_elems", + "polynomial_order", + "inner_radius", + "outer_radius", + "starting_angle", + "ending_angle", + "num_radial_elems", + "num_angular_elems", + "scale_x", + "scale_y", + "scale_z" +}; + +// ---------------------------------- +// required inputs for mesh options +// ---------------------------------- +static std::vector mesh_required_inps +{ + "source", + "num_dims", +}; + +#endif // end Header Guard \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h new file mode 100644 index 00000000..aec7a963 --- /dev/null +++ b/examples/mesh_decomp/mesh_io.h @@ -0,0 +1,1061 @@ +#ifndef MESH_IO_H +#define MESH_IO_H + +#include "matar.h" +#include "mesh.h" +#include "state.h" + +using namespace mtr; + +#include +#include +#include +#include +#include +#include // for string pattern recoginition +#include +#include 
+#include +#include +#include +#include + + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn split +/// +/// \brief Splits a string by a given delimiter +/// +/// \param Input string +/// \param delimiter +/// +/// \return Vector of split string values +/// +///////////////////////////////////////////////////////////////////////////// +inline std::vector split(std::string s, std::string delimiter) +{ + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + std::string token; + std::vector res; + + while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + + res.push_back(s.substr(pos_start)); + return res; +} // end of split + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn get_id +/// +/// \brief This gives the index value of the point or the elem +/// +/// Assumes that the grid has an i,j,k structure +/// the elem = i + (j)*(num_points_i-1) + (k)*(num_points_i-1)*(num_points_j-1) +/// the point = i + (j)*num_points_i + (k)*num_points_i*num_points_j +/// +/// \param i index +/// \param j index +/// \param k index +/// \param Number of i indices +/// \param Number of j indices +/// +///////////////////////////////////////////////////////////////////////////// +KOKKOS_INLINE_FUNCTION +size_t get_id(int i, int j, int k, int num_i, int num_j) +{ + return i + j * num_i + k * num_i * num_j; +} // end get_id + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn PointIndexFromIJK +/// +/// \brief Given (i,j,k) coordinates within the Lagrange hex, return an +/// offset into the local connectivity (PointIds) array. The order parameter +/// must point to an array of 3 integers specifying the order along each +/// axis of the hexahedron. 
+/// +///////////////////////////////////////////////////////////////////////////// +inline int PointIndexFromIJK(int i, int j, int k, const int* order) +{ + bool ibdy = (i == 0 || i == order[0]); + bool jbdy = (j == 0 || j == order[1]); + bool kbdy = (k == 0 || k == order[2]); + // How many boundaries do we lie on at once? + int nbdy = (ibdy ? 1 : 0) + (jbdy ? 1 : 0) + (kbdy ? 1 : 0); + + if (nbdy == 3) { // Vertex DOF + // ijk is a corner node. Return the proper index (somewhere in [0,7]): + return (i ? (j ? 2 : 1) : (j ? 3 : 0)) + (k ? 4 : 0); + } + + int offset = 8; + if (nbdy == 2) { // Edge DOF + if (!ibdy) { // On i axis + return (i - 1) + (j ? order[0] - 1 + order[1] - 1 : 0) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + if (!jbdy) { // On j axis + return (j - 1) + (i ? order[0] - 1 : 2 * (order[0] - 1) + order[1] - 1) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + // !kbdy, On k axis + offset += 4 * (order[0] - 1) + 4 * (order[1] - 1); + return (k - 1) + (order[2] - 1) * (i ? (j ? 3 : 1) : (j ? 2 : 0)) + offset; + } + + offset += 4 * (order[0] - 1 + order[1] - 1 + order[2] - 1); + if (nbdy == 1) { // Face DOF + if (ibdy) { // On i-normal face + return (j - 1) + ((order[1] - 1) * (k - 1)) + (i ? (order[1] - 1) * (order[2] - 1) : 0) + offset; + } + offset += 2 * (order[1] - 1) * (order[2] - 1); + if (jbdy) { // On j-normal face + return (i - 1) + ((order[0] - 1) * (k - 1)) + (j ? (order[2] - 1) * (order[0] - 1) : 0) + offset; + } + offset += 2 * (order[2] - 1) * (order[0] - 1); + // kbdy, On k-normal face + return (i - 1) + ((order[0] - 1) * (j - 1)) + (k ? 
(order[0] - 1) * (order[1] - 1) : 0) + offset; + } + + // nbdy == 0: Body DOF + offset += 2 * ( (order[1] - 1) * (order[2] - 1) + (order[2] - 1) * (order[0] - 1) + (order[0] - 1) * (order[1] - 1)); + return offset + (i - 1) + (order[0] - 1) * ( (j - 1) + (order[1] - 1) * ( (k - 1))); +} + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn build_3d_box +/// +/// \brief Builds an unstructured 3D rectilinear mesh +/// +/// \param Simulation mesh that is built +/// \param Element state data +/// \param Node state data +/// \param origin The origin of the mesh +/// \param length The length of the mesh +/// \param num_elems The number of elements in the mesh +/// +///////////////////////////////////////////////////////////////////////////// +void build_3d_box( + Mesh_t& mesh, + node_t& node, + double origin[3], + double length[3], + int num_elems_dim[3]) +{ + printf("Creating a 3D box mesh \n"); + + const int num_dim = 3; + + // Note: In fierro, these come from the simulation parameters + const double lx = length[0]; + const double ly = length[1]; + const double lz = length[2]; + + // Note: In fierro, these come from the simulation parameters + const int num_elems_i = num_elems_dim[0]; + const int num_elems_j = num_elems_dim[1]; + const int num_elems_k = num_elems_dim[2]; + + const int num_points_i = num_elems_i + 1; // num points in x + const int num_points_j = num_elems_j + 1; // num points in y + const int num_points_k = num_elems_k + 1; // num points in y + + const int num_nodes = num_points_i * num_points_j * num_points_k; + + const double dx = lx / ((double)num_elems_i); // len/(num_elems_i) + const double dy = ly / ((double)num_elems_j); // len/(num_elems_j) + const double dz = lz / ((double)num_elems_k); // len/(num_elems_k) + + const int num_elems = num_elems_i * num_elems_j * num_elems_k; + + // --- 3D parameters --- + // const int num_faces_in_elem = 6; // number of faces in elem + // const int num_points_in_elem = 8; 
// number of points in elem + // const int num_points_in_face = 4; // number of points in a face + // const int num_edges_in_elem = 12; // number of edges in a elem + + // initialize mesh node variables + mesh.initialize_nodes(num_nodes); + + // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dim, required_node_state); + + // --- Build nodes --- + + CArrayDual origin_mtr(3, "origin_mtr"); + origin_mtr.host(0) = origin[0]; + origin_mtr.host(1) = origin[1]; + origin_mtr.host(2) = origin[2]; + origin_mtr.update_device(); + + // populate the point data structures + FOR_ALL(k, 0, num_points_k, + j, 0, num_points_j, + i, 0, num_points_i,{ + + // global id for the point + size_t node_gid = get_id(i, j, k, num_points_i, num_points_j); + + // store the point coordinates + node.coords(node_gid, 0) = origin_mtr(0) + (double)i * dx; + node.coords(node_gid, 1) = origin_mtr(1) + (double)j * dy; + node.coords(node_gid, 2) = origin_mtr(2) + (double)k * dz; + }); + // Update the host side + node.coords.update_host(); + + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); + + // populate the point data structures + FOR_ALL(k, 0, num_elems_k, + j, 0, num_elems_j, + i, 0, num_elems_i,{ + + // global id for the elem + size_t elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); + + // store the point IDs for this elem where the range is + // (i:i+1, j:j+1, k:k+1) for a linear hexahedron + int this_point = 0; + for (int kcount = k; kcount <= k + 1; kcount++) { + for (int jcount = j; jcount <= j + 1; jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + size_t node_gid = get_id(icount, jcount, kcount, + num_points_i, num_points_j); + + // convert this_point index to the FE index convention + int this_index = this_point; 
//convert_point_number_in_Hex(this_point); + + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem(elem_gid, this_index) = node_gid; + + // increment the point counting index + this_point++; + } // end for icount + } // end for jcount + } // end for kcount + }); // end parallel for + + // Update the host side + mesh.nodes_in_elem.update_host(); + + Kokkos::fence(); + + // Build connectivity + mesh.build_connectivity(); +} // end build_3d_box + + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn write_vtk +/// +/// \brief Writes a vtk output file +/// +/// \param mesh mesh +/// \param node node data +/// \param rank rank +/// +///////////////////////////////////////////////////////////////////////////// + void write_vtk(Mesh_t& mesh, + node_t& node, + int rank) + { + + CArray graphics_times(1); + int graphics_id = 0; + graphics_times(0) = 0.0; + + // ---- Update host data ---- + + node.coords.update_host(); + + Kokkos::fence(); + + + const int num_cell_scalar_vars = 3; + const int num_cell_vec_vars = 0; + const int num_cell_tensor_vars = 0; + + const int num_point_scalar_vars = 3; + const int num_point_vec_vars = 2; + + + // Scalar values associated with a cell + const char cell_scalar_var_names[num_cell_scalar_vars][30] = { + "rank_id", "elems_in_elem_owned", "global_elem_id" + }; + + // const char cell_vec_var_names[num_cell_vec_vars][15] = { + + // }; + + const char point_scalar_var_names[num_point_scalar_vars][15] = { + "rank_id", "elems_in_node", "scalar_field" + }; + + const char point_vec_var_names[num_point_vec_vars][15] = { + "pos", "vector_field" + }; + + // short hand + const size_t num_nodes = mesh.num_owned_nodes; + const size_t num_elems = mesh.num_owned_elems; + const size_t num_dims = mesh.num_dims; + + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_cell_scalar_vars); + int 
elem_switch = 1; + + + // save the output scale fields to a single 2D array + + + // export material centeric data to the elements + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + elem_fields(elem_gid, 0) = rank; + elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); + elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); + } + + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, num_point_vec_vars, 3); + CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = node.coords.host(node_gid, 1); + vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + + // vector field, var 1 + vec_fields(node_gid, 1, 0) = node.vector_field.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = node.vector_field.host(node_gid, 1); + vec_fields(node_gid, 1, 2) = node.vector_field.host(node_gid, 2); + + point_scalar_fields(node_gid, 0) = rank; + point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); + point_scalar_fields(node_gid, 2) = node.scalar_field.host(node_gid); + + } // end for loop over vertices + + + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + + if (stat("vtk", &st) != 0) { + system("mkdir vtk"); + } + + // snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); + + //sprintf(filename, "vtk/Fierro.%05d.vtk", graphics_id); // mesh file + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d_rank%d.vtk", graphics_id, rank); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], 
"# vtk DataFile Version 2.0\n"); // part 2 + fprintf(out[0], "Mesh for Fierro\n"); // part 2 + fprintf(out[0], "ASCII \n"); // part 3 + fprintf(out[0], "DATASET UNSTRUCTURED_GRID\n\n"); // part 4 + + fprintf(out[0], "POINTS %zu float\n", num_nodes); + + // write all components of the point coordinates + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], + "%f %f %f\n", + node.coords.host(node_gid, 0), + node.coords.host(node_gid, 1), + node.coords.host(node_gid, 2)); + } // end for + + /* + --------------------------------------------------------------------------- + Write the elems + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "CELLS %lu %lu\n", num_elems, num_elems + num_elems * mesh.num_nodes_in_elem); // size=all printed values + + int Pn_order = mesh.Pn; + int order[3] = { Pn_order, Pn_order, Pn_order }; + + // const int num_1D_points = Pn_order+1; + + // write all global point numbers for this elem + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(out[0], "%lu ", mesh.num_nodes_in_elem); // num points in this elem + + for (int k = 0; k <= Pn_order; k++) { + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + fprintf(out[0], "%lu ", mesh.nodes_in_elem.host(elem_gid, node_lid)); + } + } + } + + fprintf(out[0], "\n"); + } // end for + + // Write the element types + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_TYPES %zu \n", num_elems); + // VTK_LAGRANGE_HEXAHEDRON: 72, + // VTK_HIGHER_ORDER_HEXAHEDRON: 67 + // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 + // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html + // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html + // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ + for 
(size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(out[0], "%d \n", 72); + } + + /* + --------------------------------------------------------------------------- + Write the nodal vector variables to file + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "POINT_DATA %zu \n", num_nodes); + + // vtk vector vars = (position, velocity) + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(out[0], "VECTORS %s float \n", point_vec_var_names[var]); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%f %f %f\n", + vec_fields(node_gid, var, 0), + vec_fields(node_gid, var, 1), + vec_fields(node_gid, var, 2)); + } // end for nodes + } // end for vec_vars + + + // vtk scalar vars = (rank_id, elems_in_node) + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", point_scalar_var_names[var]); + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%f\n", + point_scalar_fields(node_gid, var)); + } // end for nodes + } // end for scalar_vars + + /* + --------------------------------------------------------------------------- + Write the scalar elem variable to file + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_DATA %zu \n", num_elems); + + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(out[0], "%f\n", elem_fields(elem_gid, var)); + } // end for elem + } // end for cell scalar_vars + + fclose(out[0]); + + // graphics_times(graphics_id) = time_value; + + // Write time series metadata + //sprintf(filename, 
"vtk/Fierro.vtk.series", graphics_id); // mesh file + str_output_len = snprintf(filename, max_len, "vtk/Fierro.vtk.series"); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "{\n"); + fprintf(out[0], " \"file-series-version\" : \"1.0\",\n"); + fprintf(out[0], " \"files\" : [\n"); + + for (int i = 0; i <= graphics_id; i++) { + fprintf(out[0], " { \"name\" : \"Fierro.%05d.vtk\", \"time\" : %12.5e },\n", i, graphics_times(i) ); + } + + // fprintf(out[0], "%12.5e\n", graphics_times(i)); + fprintf(out[0], " ]\n"); // part 4 + fprintf(out[0], "}"); // part 4 + + fclose(out[0]); + + // increment graphics id counter + // graphics_id++; + + + } // end write vtk old + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn write_vtu +/// +/// \brief Writes a VTU (XML VTK) output file per MPI rank and a PVTU file +/// for parallel visualization in ParaView +/// +/// \param mesh mesh +/// \param node node data +/// \param rank MPI rank +/// \param comm MPI communicator +/// +///////////////////////////////////////////////////////////////////////////// +void write_vtu(Mesh_t& mesh, + node_t& node, + GaussPoint_t& gauss_point, + int rank, + MPI_Comm comm) +{ + int world_size; + MPI_Comm_size(comm, &world_size); + + CArray graphics_times(1); + int graphics_id = 0; + graphics_times(0) = 0.0; + + // ---- Update host data ---- + node.coords.update_host(); + Kokkos::fence(); + + const int num_cell_scalar_vars = 4; + const int num_cell_vec_vars = 1; + const int num_cell_tensor_vars = 0; + + const int num_point_scalar_vars = 4; + const int num_point_vec_vars = 2; + + // Scalar values associated with a cell + const char cell_scalar_var_names[num_cell_scalar_vars][30] = { + "rank_id", "elems_in_elem_owned", "global_elem_id", "field_value" + }; + + const char cell_vec_var_names[num_cell_vec_vars][15] = { + "field_vec" + }; + + const 
char point_scalar_var_names[num_point_scalar_vars][15] = { + "rank_id", "elems_in_node", "global_node_id", "scalar_field" + }; + + const char point_vec_var_names[num_point_vec_vars][15] = { + "pos", "vector_field" + }; + + // short hand + const size_t num_nodes = mesh.num_owned_nodes; + const size_t num_elems = mesh.num_owned_elems; + const size_t num_dims = mesh.num_dims; + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_cell_scalar_vars); + auto elem_vec_fields = CArray(num_elems, num_cell_vec_vars, 3); + + DCArrayKokkos num_elems_in_elem(mesh.num_elems, "tmp_num_elem_in_elem"); + FOR_ALL(i, 0, mesh.num_elems, { + num_elems_in_elem(i) = (double)mesh.num_elems_in_elem(i); + }); + MATAR_FENCE(); + num_elems_in_elem.update_host(); + MATAR_FENCE(); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + elem_fields(elem_gid, 0) = rank; + elem_fields(elem_gid, 1) = num_elems_in_elem.host(elem_gid); + elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); + elem_fields(elem_gid, 3) = gauss_point.fields.host(elem_gid); + elem_vec_fields(elem_gid, 0, 0) = gauss_point.fields_vec.host(elem_gid, 0); + elem_vec_fields(elem_gid, 0, 1) = gauss_point.fields_vec.host(elem_gid, 1); + elem_vec_fields(elem_gid, 0, 2) = gauss_point.fields_vec.host(elem_gid, 2); + } + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, num_point_vec_vars, 3); + CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + + DCArrayKokkos num_elems_in_node(mesh.num_elems, "tmp_num_elems_in_node"); + FOR_ALL(i, 0, mesh.num_elems, { + num_elems_in_node(i) = (double)mesh.num_corners_in_node(i); + }); + MATAR_FENCE(); + num_elems_in_node.update_host(); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = 
node.coords.host(node_gid, 1); + vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + + // vector field, var 1 + vec_fields(node_gid, 1, 0) = node.vector_field.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = node.vector_field.host(node_gid, 1); + vec_fields(node_gid, 1, 2) = node.vector_field.host(node_gid, 2); + + point_scalar_fields(node_gid, 0) = rank; + point_scalar_fields(node_gid, 1) = num_elems_in_node.host(node_gid); + point_scalar_fields(node_gid, 2) = (double)mesh.local_to_global_node_mapping.host(node_gid); + point_scalar_fields(node_gid, 3) = node.scalar_field.host(node_gid); + } + + // File management + char filename[200]; + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + if (stat("vtk", &st) != 0) { + system("mkdir vtk"); + } + + // Create VTU filename for this rank + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d_rank%d.vtu", graphics_id, rank); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + FILE* vtu_file = fopen(filename, "w"); + if (!vtu_file) { + std::cerr << "[rank " << rank << "] Failed to open VTU file: " << filename << std::endl; + return; + } + + // Write VTU XML header + fprintf(vtu_file, "\n"); + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n", num_nodes, num_elems); + + // Write Points (coordinates) + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f %f %f\n", + node.coords.host(node_gid, 0), + node.coords.host(node_gid, 1), + node.coords.host(node_gid, 2)); + } + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + + // Write Cells (connectivity) + fprintf(vtu_file, " \n"); + + // Connectivity array - all node indices for all cells, space-separated + fprintf(vtu_file, " \n"); + int Pn_order = mesh.Pn; + int order[3] = { Pn_order, Pn_order, Pn_order }; + + // Write connectivity: all node IDs for all 
elements, space-separated + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + for (int k = 0; k <= Pn_order; k++) { + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + fprintf(vtu_file, " %zu", static_cast(mesh.nodes_in_elem.host(elem_gid, node_lid))); + } + } + } + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + + // Offsets array - cumulative index where each cell's connectivity ends + fprintf(vtu_file, " \n"); + int offset = 0; + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + offset += static_cast(mesh.num_nodes_in_elem); + fprintf(vtu_file, " %d", offset); + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + + // Types array (72 = VTK_LAGRANGE_HEXAHEDRON) + fprintf(vtu_file, " \n"); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(vtu_file, " 72"); + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + + // Write PointData (node fields) + fprintf(vtu_file, " \n"); + + // Point vector variables + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(vtu_file, " \n", + point_vec_var_names[var]); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f %f %f\n", + vec_fields(node_gid, var, 0), + vec_fields(node_gid, var, 1), + vec_fields(node_gid, var, 2)); + } + fprintf(vtu_file, " \n"); + } + + // Point scalar variables + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(vtu_file, " \n", + point_scalar_var_names[var]); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f\n", point_scalar_fields(node_gid, var)); + } + fprintf(vtu_file, " \n"); + } + fprintf(vtu_file, " \n"); + + // Write CellData (element fields) + fprintf(vtu_file, " \n"); + + // Cell vector variables + for (int var = 0; var < num_cell_vec_vars; var++) { + fprintf(vtu_file, " \n", + cell_vec_var_names[var]); + for 
(size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + // TODO: Populate cell vector field data from appropriate source + fprintf(vtu_file, " %f %f %f\n", + gauss_point.fields_vec.host(elem_gid, 0), + gauss_point.fields_vec.host(elem_gid, 1), + gauss_point.fields_vec.host(elem_gid, 2)); + } + fprintf(vtu_file, " \n"); + } + + // Cell scalar variables + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(vtu_file, " \n", + cell_scalar_var_names[var]); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(vtu_file, " %f\n", elem_fields(elem_gid, var)); + } + fprintf(vtu_file, " \n"); + } + fprintf(vtu_file, " \n"); + + // Close VTU file + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, "\n"); + fclose(vtu_file); + + // Write PVTU file (only rank 0, after all ranks have written their VTU files) + MPI_Barrier(comm); + + if (rank == 0) { + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d.pvtu", graphics_id); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + FILE* pvtu_file = fopen(filename, "w"); + if (!pvtu_file) { + std::cerr << "[rank 0] Failed to open PVTU file: " << filename << std::endl; + return; + } + + // Write PVTU XML header + fprintf(pvtu_file, "\n"); + fprintf(pvtu_file, "\n"); + fprintf(pvtu_file, " \n"); + + // Write PPoints + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + + // Write PCells + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + + // Write PPointData + fprintf(pvtu_file, " \n"); + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(pvtu_file, " \n", + point_vec_var_names[var]); + } + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(pvtu_file, " \n", + point_scalar_var_names[var]); + } + fprintf(pvtu_file, " \n"); + + // Write PCellData + fprintf(pvtu_file, 
" \n"); + for (int var = 0; var < num_cell_vec_vars; var++) { + fprintf(pvtu_file, " \n", + cell_vec_var_names[var]); + } + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(pvtu_file, " \n", + cell_scalar_var_names[var]); + } + fprintf(pvtu_file, " \n"); + + // Write Piece references for each rank + for (int r = 0; r < world_size; r++) { + fprintf(pvtu_file, " \n", graphics_id, r); + } + + // Close PVTU file + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, "\n"); + fclose(pvtu_file); + } + +} // end write_vtu + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_vtk_mesh + /// + /// \brief Read ASCII .vtk mesh file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Node state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_vtk_mesh(Mesh_t& mesh, + node_t& node, + int num_dims, + std::string mesh_file_) +{ + + std::cout<<"Reading VTK mesh"< v = split (str, delimiter); + + // looking for the following text: + // POINTS %d float + if(v[0] == "POINTS"){ + size_t num_nodes = std::stoi(v[1]); + printf("Number of nodes read in %zu\n", num_nodes); + mesh.initialize_nodes(num_nodes); + + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dims, required_node_state); + + found=true; + } // end if + + + if (i>1000){ + std::cerr << "ERROR: Failed to find POINTS in file" << std::endl; + break; + } // end if + + i++; + } // end while + + // read the node coordinates + for (node_gid=0; node_gid v = split (str, delimiter); + + // save the nodal coordinates + node.coords.host(node_gid, 0) = std::stod(v[0]); // double + node.coords.host(node_gid, 1) = std::stod(v[1]); // double + if(num_dims==3){ + node.coords.host(node_gid, 2) = std::stod(v[2]); // double + } + + } // end for nodes + + + // Update device nodal positions + node.coords.update_device(); + + + 
found=false; + + // look for CELLS + i = 0; + size_t num_elem = 0; + while (found==false) { + std::string str; + std::getline(in, str); + + std::string delimiter = " "; + std::vector v = split (str, delimiter); + std::cout << v[0] << std::endl; // printing + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELLS"){ + num_elem = std::stoi(v[1]); + printf("Number of elements read in %zu\n", num_elem); + + // initialize elem variables + mesh.initialize_elems(num_elem, num_dims); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find CELLS \n"); + break; + } // end if + + i++; + } // end while + + + // read the node ids in the element + for (elem_gid=0; elem_gid v = split (str, delimiter); + num_nodes_in_elem = std::stoi(v[0]); + + for (size_t node_lid=0; node_lid v = split (str, delimiter); + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELL_TYPES"){ + + std::getline(in, str); + elem_type = std::stoi(str); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find elem_TYPE \n"); + break; + } // end if + + i++; + } // end while + printf("Element type = %zu \n", elem_type); + // elem types: + // linear hex = 12, linear quad = 9 + found=false; + + + if(num_nodes_in_elem==8 & elem_type != 12) { + printf("Wrong element type of %zu \n", elem_type); + std::cerr << "ERROR: incorrect element type in VTK file" << std::endl; + } + + in.close(); + +} // end of VTKread function + +#endif \ No newline at end of file diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h new file mode 100644 index 00000000..eb3d5a6b --- /dev/null +++ b/examples/mesh_decomp/state.h @@ -0,0 +1,206 @@ +/********************************************************************************************** +� 2020. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. 
Government contract 89233218CNA000001 for Los Alamos +National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All rights in the program are +reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear +Security Administration. The Government is granted for itself and others acting on its behalf a +nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare +derivative works, distribute copies to the public, perform publicly and display publicly, and +to permit others to do so. +This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used +to endorse or promote products derived from this software without specific prior +written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************************/ +#ifndef STATE_H +#define STATE_H + +#include "matar.h" +// #include "mpi_type.h" + +using namespace mtr; + + +// Possible node states, used to initialize node_t +enum class node_state +{ + coords, + scalar_field, + vector_field +}; + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \struct node_t +/// +/// \brief Stores state information associated with a node +/// +///////////////////////////////////////////////////////////////////////////// +struct node_t +{ + + // Replace with MPIDCArrayKokkos + MPICArrayKokkos coords; ///< Nodal coordinates + MPICArrayKokkos coords_n0; ///< Nodal coordinates at tn=0 of time integration + + MPICArrayKokkos scalar_field; ///< Scalar field on a node + MPICArrayKokkos vector_field; ///< Vector field on a node + + + // initialization method (num_nodes, num_dims, state to allocate) + void initialize(size_t num_nodes, size_t num_dims, std::vector node_states) + { + + CommunicationPlan comm_plan; + + for (auto field : node_states){ + switch(field){ + case node_state::coords: + if (coords.size() == 0){ + this->coords = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates"); + this->coords.initialize_comm_plan(comm_plan); + } + if (coords_n0.size() == 0){ + this->coords_n0 = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + this->coords_n0.initialize_comm_plan(comm_plan); + 
} + break; + case node_state::scalar_field: + if (scalar_field.size() == 0) this->scalar_field = MPICArrayKokkos(num_nodes, "node_scalar_field"); + this->scalar_field.initialize_comm_plan(comm_plan); + break; + case node_state::vector_field: + if (vector_field.size() == 0) this->vector_field = MPICArrayKokkos(num_nodes, num_dims, "node_vector_field"); + this->vector_field.initialize_comm_plan(comm_plan); + break; + default: + std::cout<<"Desired node state not understood in node_t initialize"< node_states, CommunicationPlan& comm_plan) + { + for (auto field : node_states){ + switch(field){ + case node_state::coords: + if (coords.size() == 0){ + this->coords = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates"); + this->coords.initialize_comm_plan(comm_plan); + } + if (coords_n0.size() == 0){ + this->coords_n0 = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + this->coords_n0.initialize_comm_plan(comm_plan); + } + break; + case node_state::scalar_field: + if (scalar_field.size() == 0) this->scalar_field = MPICArrayKokkos(num_nodes, "node_scalar_field"); + this->scalar_field.initialize_comm_plan(comm_plan); + break; + case node_state::vector_field: + if (vector_field.size() == 0) this->vector_field = MPICArrayKokkos(num_nodes, num_dims, "node_vector_field"); + this->vector_field.initialize_comm_plan(comm_plan); + break; + default: + std::cout<<"Desired node state not understood in node_t initialize"< fields; + MPICArrayKokkos fields_vec; + + // initialization method (num_cells, num_dims) + void initialize(size_t num_gauss_pnts, size_t num_dims, std::vector gauss_pt_states, CommunicationPlan& comm_plan) + { + + for (auto field : gauss_pt_states){ + switch(field){ + case gauss_pt_state::fields: + //if (fields.size() == 0) this->fields = DCArrayKokkos(num_gauss_pnts, "gauss_point_fields"); + if (fields.size() == 0){ + this->fields = MPICArrayKokkos(num_gauss_pnts, "gauss_point_fields"); + this->fields.initialize_comm_plan(comm_plan); + } + break; + 
case gauss_pt_state::fields_vec: + if (fields_vec.size() == 0){ + this->fields_vec = MPICArrayKokkos(num_gauss_pnts, num_dims, "gauss_point_fields_vec"); + this->fields_vec.initialize_comm_plan(comm_plan); + } + break; + default: + std::cout<<"Desired gauss point state not understood in GaussPoint_t initialize"< + #ifdef HAVE_MPI #include -#include "partition_map.h" +#include "matar.h" -namespace mtr -{ +#include -///////////////////////// -/* CommunicationPlan: Class storing relevant data and functions to perform comms between two different MATAR MPI types. - The object for this class should not be reconstructed if the same comm plan is needed repeatedly; the setup is expensive. - The comms routines such as execute_comms can be called repeatedly to avoid repeated setup of the plan.*/ -///////////////////////// -template -class CommunicationPlan { +using namespace mtr; - // this is manage - using TArray1D = Kokkos::DualView ; - -protected: -public: - - /*forward comms means communicating data to a vector that doesn't have a unique distribution of its global - indices amongst processes from a vector that does have a unique distribution amongst processes. - An example of forward comms in a finite element application would be communicating ghost data from - the vector of local data. - - reverse comms means communicating data to a vector that has a unique distribution of its global - indices amongst processes from a vector that does not have a unique distribution amongst processes. - An example of reverse comms in a finite element application would be communicating force contributions from ghost - indices via summation to the entries of the uniquely owned vector that stores final tallies of forces. 
- */ - bool reverse_comms_flag; //default is false - - CommunicationPlan(); - - //Copy Constructor - CommunicationPlan(const CommunicationPlan &temp){ - *this = temp; - } +enum class communication_plan_type { + no_communication, + all_to_all_graph +}; + + +struct CommunicationPlan { - CommunicationPlan(bool reverse_comms); + // ======================================================================== + // Metadata for MPI neighbor graph communication + // ======================================================================== - KOKKOS_INLINE_FUNCTION - CommunicationPlan& operator=(const CommunicationPlan& temp); + communication_plan_type comm_type = communication_plan_type::no_communication; - // Deconstructor - virtual KOKKOS_INLINE_FUNCTION - ~CommunicationPlan (); + // MPI world communicator + MPI_Comm mpi_comm_world; + bool has_comm_world = false; + int world_size = -1; - virtual void execute_comms(){} -}; // End of CommunicationPlan + // MPI graph communicator + MPI_Comm mpi_comm_graph; + bool has_comm_graph = false; + + // Number of send and recv ranks + int num_send_ranks; // In MPI language, this is the outdegree of the graph communicator + int num_recv_ranks; // In MPI language, this is the indegree of the graph communicator + // Rank IDs for send and recv ranks + DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs + DCArrayKokkos recv_rank_ids; // [size: num_recv_ranks] Source rank IDs -// Default constructor -template -CommunicationPlan::CommunicationPlan() { + // recv_weights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* recv_weights = MPI_UNWEIGHTED; // [size: num_recv_ranks] Weights on incoming edges, set to MPI_UNWEIGHTED if not used + + // send_weights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* send_weights = 
MPI_UNWEIGHTED; // [size: num_send_ranks] Weights on outgoing edges, set to MPI_UNWEIGHTED if not used -} + // info: Hints for optimization (MPI_INFO_NULL means use defaults) + MPI_Info info = MPI_INFO_NULL; + + // reorder: Whether to allow MPI to reorder ranks for optimization (0=no reordering) + // Setting to 0 preserves original rank numbering + // Note: In the future, we may want to allow MPI to reorder ranks for optimization by setting to 1, + // this would allow MPI to reorder the ranks to make them physically closer on the hardware. + // This is a good optimization for large meshes, but will require maps from MPI_comm_world rank IDs to the new reordered rank IDs. + int reorder = 0; -// Overloaded 1D constructor -template -CommunicationPlan::CommunicationPlan(bool reverse_comms) { - reverse_comms_flag = reverse_comms; -} + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_per_rank] Indices of items to send to each rank + DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_per_rank] Indices of items to receive from each rank + DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank + DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank + + + DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank + DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank -template -KOKKOS_INLINE_FUNCTION -CommunicationPlan& CommunicationPlan::operator= (const CommunicationPlan& temp) { + int total_send_count; // Total number of items to send + int total_recv_count; // Total number of items to receive + + // ======================================================================== + // CONSTRUCTOR / INITIALIZATION + // ======================================================================== + + CommunicationPlan() + : num_send_ranks(0), 
num_recv_ranks(0), + has_comm_graph(false) {} + + + // Destructor to free MPI resources + ~CommunicationPlan() { + // Free graph communicator + if (has_comm_graph && mpi_comm_graph != MPI_COMM_NULL) { + MPI_Comm_free(&mpi_comm_graph); + } + } + - // Do nothing if the assignment is of the form x = x - if (this != &temp) { - reverse_comms_flag = temp.reverse_comms_flag; + void initialize(MPI_Comm comm_world){ + this->mpi_comm_world = comm_world; + has_comm_world = true; + MPI_Comm_size(comm_world, &world_size); } - return *this; -} + /** + * @brief Initialize an MPI distributed graph communicator for sparse neighbor communication. + * + * This function creates an MPI "dist graph communicator" tailored to the sparse data exchange + * patterns typical in mesh-based parallel applications. It establishes direct knowledge for MPI + * about which processes (ranks) each process will communicate with. This improves the efficiency + * and clarity of later communication (for example, with MPI_Neighbor_alltoallv). + * + * This function is especially useful when the communication pattern is not all-to-all, but rather + * a sparse subset: for instance, where each process only exchanges data with a few neighbors. + * + * ==== Key Concepts ==== + * - MPI Communicator: An MPI object representing a group of processes that can communicate with each other. + * For context, "MPI_COMM_WORLD" is a communicator including all processes, but a graph communicator + * customizes direct process connections. + * - Rank: Integer ID identifying a process in a communicator. + * - Distributed Graph: MPI can represent communication as a directed sparse graph, with edges from + * this rank to those it needs to send to, and from those it will receive from. + * + * ==== Parameters ==== + * @param num_send_ranks [in] Number of ranks this process will send data to (out-neighbors). + * @param send_rank_ids [in] Array of size num_send_ranks; each entry is the rank of a process to send to. 
+ * @param num_recv_ranks [in] Number of ranks this process will receive data from (in-neighbors). + * @param recv_rank_ids [in] Array of size num_recv_ranks; each entry is the rank of a process to receive from. + * + * ==== Steps ==== + * + * 1. Checks if the basic communicator has been initialized. + * Throws an error if it has not. + * + * 2. Stores the send/receive neighbor counts and rank lists internally. + * Copies the IDs into the internal device-host arrays. + * - send_rank_ids: process IDs that will be destinations for outgoing messages. + * - recv_rank_ids: process IDs that will provide incoming messages. + * + * 3. Calls MPI_Dist_graph_create_adjacent: + * This constructs a new MPI communicator ("mpi_comm_graph") that encodes this process's + * inbound and outbound neighbors. MPI uses this to optimize and route messages directly + * and efficiently during later neighbor collectives. + * + * - Note: The 'recv_weights' and 'send_weights' arguments are set to NULL here; + * this means we are not giving extra weighting or priorities to any connection. + * - The 'reorder' argument (set to 0 in this class) disables rank reordering; + * this ensures the assignment of process ranks is preserved, which is often needed + * for mapping data or results back to physical entities. + * - On return, 'mpi_comm_graph' will allow use of "neighbor" collectives (MPI_Neighbor_alltoall[v], etc.), + * which automatically use the provided topology to send/receive to only neighbors efficiently. + * + * 4. Marks the internal flag indicating that the graph communicator has been set up ("has_comm_graph"). + * + * ==== Example Usage ==== + * Suppose rank 0 will send to ranks 1 and 2, and receive from rank 3 only: + * int send_ranks[2] = {1, 2}; + * int recv_ranks[1] = {3}; + * initialize_graph_communicator(2, send_ranks, 1, recv_ranks); + * + * ==== Why Use This? 
==== + * - This avoids the need to do manual pairwise MPI_Send/MPI_Recv in your code, + * and enables the use of neighbor collectives -- concise, scalable, and hard-to-get-wrong. + * - It explicitly tells MPI only about your neighbors, so it can optimize routes and memory. + * - If you have a large number of processes or a mesh/network with only local coupling, + * this approach scales much better than using global/all-to-all communication. + * + * @throws std::runtime_error if the base communicator has not been initialized. + */ + void initialize_graph_communicator(int num_send_ranks, int* send_rank_ids, int num_recv_ranks, int* recv_rank_ids){ + + this->comm_type = communication_plan_type::all_to_all_graph; + // Check if the MPI_COMM_WORLD communicator has been initialized. + if(!has_comm_world){ + throw std::runtime_error("MPI communicator for the world has not been initialized"); + } + + // Store the number of outbound and inbound neighbors + this->num_send_ranks = num_send_ranks; + this->num_recv_ranks = num_recv_ranks; + + // Copy and store send neighbor IDs (out-bound neighbors: where we will send data to) + this->send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); + for(int i = 0; i < num_send_ranks; i++){ + this->send_rank_ids.host(i) = send_rank_ids[i]; + } + this->send_rank_ids.update_device(); + MATAR_FENCE(); -template -KOKKOS_INLINE_FUNCTION -CommunicationPlan::~CommunicationPlan() {} + // Copy and store receive neighbor IDs (in-bound neighbors: where we will receive data from) + this->recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_rank_ids.host(i) = recv_rank_ids[i]; + } + this->recv_rank_ids.update_device(); + MATAR_FENCE(); + + // Create the distributed graph communicator. + // This call links this process to its explicit send and receive neighbors. + // See https://www.open-mpi.org/doc/v4.0/man3/MPI_Dist_graph_create_adjacent.3.php for more details. 
+ MPI_Dist_graph_create_adjacent( + mpi_comm_world, // Existing communicator (usually MPI_COMM_WORLD) + num_recv_ranks, // Number of in-neighbors (recv) + this->recv_rank_ids.host_pointer(), // Array of in-neighbor ranks (who we receive from) + recv_weights, // Edge weights (NULL = unweighted) + num_send_ranks, // Number of out-neighbors (send) + this->send_rank_ids.host_pointer(), // Array of out-neighbor ranks (who we send to) + send_weights, // Edge weights (NULL = unweighted) + info, // Additional info for MPI (not used, set to MPI_INFO_NULL) + reorder, // Allow MPI to reorder ranks for performance (0 disables) + &mpi_comm_graph // [out] New graph communicator + ); -//////////////////////////////////////////////////////////////////////////////// -// End of CommunicationPlan -//////////////////////////////////////////////////////////////////////////////// + // Set the internal flag indicating that we have created the MPI distributed graph communicator. + has_comm_graph = true; + } + + // Useful function for debugging, possibly remove + void verify_graph_communicator(){ + if(!has_comm_graph){ + throw std::runtime_error("MPI graph communicator has not been initialized"); + } + + // ============================================================================ + // Verify the distributed graph communicator + // ============================================================================ + // Query the graph to verify it matches what we specified + int indegree_out, outdegree_out, weighted; + MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); + + // Allocate arrays to receive neighbor information + std::vector sources_out(indegree_out); + std::vector sourceweights_out(indegree_out); + std::vector destinations_out(outdegree_out); + std::vector destweights_out(outdegree_out); + + // Retrieve the actual neighbors from the graph communicator + MPI_Dist_graph_neighbors(mpi_comm_graph, + indegree_out, sources_out.data(), 
sourceweights_out.data(), + outdegree_out, destinations_out.data(), destweights_out.data()); + + int rank = -1; + MPI_Comm_rank(mpi_comm_world, &rank); + + // Additional verification: Check if the queried values match our input + bool verification_passed = true; + + // Print verification information for each rank sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(mpi_comm_world); + if (rank == r) { + std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; + std::cout << " Indegree (receives from " << indegree_out << " ranks): "; + for (int i = 0; i < indegree_out; ++i) { + std::cout << sources_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; + for (int i = 0; i < outdegree_out; ++i) { + std::cout << destinations_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; + } + MPI_Barrier(mpi_comm_world); + } + + // Check if the counts match our stored values + if (indegree_out != num_recv_ranks) { + std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " + << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; + verification_passed = false; + } + if (outdegree_out != num_send_ranks) { + std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " + << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; + verification_passed = false; + } + + // Check if source ranks match (build set from our stored recv_rank_ids) + std::set sources_set_in; + for (int i = 0; i < num_recv_ranks; ++i) { + sources_set_in.insert(recv_rank_ids.host(i)); + } + std::set sources_set_out(sources_out.begin(), sources_out.end()); + if (sources_set_in != sources_set_out) { + std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" 
<< std::endl; + verification_passed = false; + } + + // Check if destination ranks match (build set from our stored send_rank_ids) + std::set dests_set_in; + for (int i = 0; i < num_send_ranks; ++i) { + dests_set_in.insert(send_rank_ids.host(i)); + } + std::set dests_set_out(destinations_out.begin(), destinations_out.end()); + if (dests_set_in != dests_set_out) { + std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; + verification_passed = false; + } + + // Global verification check + int local_passed = verification_passed ? 1 : 0; + int global_passed = 0; + MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); + MPI_Barrier(mpi_comm_world); + if (rank == 0) { + if (global_passed) { + std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(mpi_comm_world); + } -} // end namespace + // Setup send/receive metadata + void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ + + this->send_indices_ = rank_send_ids; // indices of element data to send to each rank + this->recv_indices_ = rank_recv_ids; // indices of element data to receive from each rank + + // Setup send data + this->send_counts_ = DCArrayKokkos(num_send_ranks, "send_counts"); + this->total_send_count = 0; + for(int i = 0; i < num_send_ranks; i++){ + this->send_counts_.host(i) = rank_send_ids.stride_host(i); + this->total_send_count += this->send_counts_.host(i); + } + this->send_counts_.update_device(); + + this->send_displs_ = DCArrayKokkos(num_send_ranks, "send_displs"); + for(int i = 0; i < num_send_ranks; i++){ + this->send_displs_.host(i) = 0; + for(int j = 0; j < i; j++){ + this->send_displs_.host(i) += this->send_counts_.host(j); + } + } + this->send_displs_.update_device(); + + // Setup recv data + this->recv_counts_ = 
DCArrayKokkos(num_recv_ranks, "recv_counts"); + this->total_recv_count = 0; + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_counts_.host(i) = rank_recv_ids.stride_host(i); + this->total_recv_count += this->recv_counts_.host(i); + } + this->recv_counts_.update_device(); + + this->recv_displs_ = DCArrayKokkos(num_recv_ranks, "recv_displs"); + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_displs_.host(i) = 0; + for(int j = 0; j < i; j++){ + this->recv_displs_.host(i) += this->recv_counts_.host(j); + } + } + this->recv_displs_.update_device(); + MATAR_FENCE(); + } + + // Useful function for debugging, possibly remove + void verify_send_recv(){ + + if(!has_comm_graph){ + throw std::runtime_error("Graph communicator has not been initialized"); + } + + int rank = -1; + MPI_Comm_rank(mpi_comm_world, &rank); + + bool local_verification_passed = true; + + // ============================================================================ + // Local Verification: Check consistency of counts and displacements + // ============================================================================ + + // Verify send counts and displacements + int computed_total_send = 0; + for(int i = 0; i < num_send_ranks; i++){ + computed_total_send += send_counts_.host(i); + + // Verify displacements are consistent + int expected_displs = 0; + for(int j = 0; j < i; j++){ + expected_displs += send_counts_.host(j); + } + if(send_displs_.host(i) != expected_displs){ + std::cerr << "[rank " << rank << "] ERROR: send_displs[" << i << "] mismatch! " + << "Expected " << expected_displs << ", got " << send_displs_.host(i) << std::endl; + local_verification_passed = false; + } + } + + // Verify total send count + if(computed_total_send != total_send_count){ + std::cerr << "[rank " << rank << "] ERROR: total_send_count mismatch! 
" + << "Expected " << computed_total_send << ", got " << total_send_count << std::endl; + local_verification_passed = false; + } + + // Verify recv counts and displacements + int computed_total_recv = 0; + for(int i = 0; i < num_recv_ranks; i++){ + computed_total_recv += recv_counts_.host(i); + + // Verify displacements are consistent + int expected_displs = 0; + for(int j = 0; j < i; j++){ + expected_displs += recv_counts_.host(j); + } + if(recv_displs_.host(i) != expected_displs){ + std::cerr << "[rank " << rank << "] ERROR: recv_displs[" << i << "] mismatch! " + << "Expected " << expected_displs << ", got " << recv_displs_.host(i) << std::endl; + local_verification_passed = false; + } + } + + // Verify total recv count + if(computed_total_recv != total_recv_count){ + std::cerr << "[rank " << rank << "] ERROR: total_recv_count mismatch! " + << "Expected " << computed_total_recv << ", got " << total_recv_count << std::endl; + local_verification_passed = false; + } + + // Verify send indices are within bounds (basic sanity check) + for(int i = 0; i < num_send_ranks; i++){ + for(int j = 0; j < send_indices_.stride_host(i); j++){ + int idx = send_indices_.host(i, j); + if(idx < 0){ + std::cerr << "[rank " << rank << "] ERROR: negative send index at rank " << i + << ", index " << j << ": " << idx << std::endl; + local_verification_passed = false; + } + } + } + + // Verify recv indices are within bounds (basic sanity check) + for(int i = 0; i < num_recv_ranks; i++){ + for(int j = 0; j < recv_indices_.stride_host(i); j++){ + int idx = recv_indices_.host(i, j); + if(idx < 0){ + std::cerr << "[rank " << rank << "] ERROR: negative recv index at rank " << i + << ", index " << j << ": " << idx << std::endl; + local_verification_passed = false; + } + } + } + + // ============================================================================ + // Print local verification information for each rank sequentially + // 
============================================================================ + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(mpi_comm_world); + if (rank == r) { + std::cout << "\n[rank " << rank << "] Send/Recv Communication Plan Verification:" << std::endl; + + std::cout << " Send Configuration:" << std::endl; + std::cout << " - Num send ranks: " << num_send_ranks << std::endl; + std::cout << " - Total send count: " << total_send_count << std::endl; + std::cout << " - Send counts per rank: "; + for (int i = 0; i < num_send_ranks; ++i) { + std::cout << send_counts_.host(i) << " "; + } + std::cout << std::endl; + std::cout << " - Send displacements: "; + for (int i = 0; i < num_send_ranks; ++i) { + std::cout << send_displs_.host(i) << " "; + } + std::cout << std::endl; + + std::cout << " Recv Configuration:" << std::endl; + std::cout << " - Num recv ranks: " << num_recv_ranks << std::endl; + std::cout << " - Total recv count: " << total_recv_count << std::endl; + std::cout << " - Recv counts per rank: "; + for (int i = 0; i < num_recv_ranks; ++i) { + std::cout << recv_counts_.host(i) << " "; + } + std::cout << std::endl; + std::cout << " - Recv displacements: "; + for (int i = 0; i < num_recv_ranks; ++i) { + std::cout << recv_displs_.host(i) << " "; + } + std::cout << std::endl; + } + MPI_Barrier(mpi_comm_world); + } + + // ============================================================================ + // Global Verification: Use MPI to verify consistency across ranks + // ============================================================================ + int local_passed = local_verification_passed ? 
1 : 0; + int global_passed = 0; + MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); + MPI_Barrier(mpi_comm_world); + + if (rank == 0) { + if (global_passed) { + std::cout << "\n✓ Send/Recv communication plan verification PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Send/Recv communication plan verification FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(mpi_comm_world); + + if(!global_passed){ + throw std::runtime_error("Send/Recv communication plan verification failed"); + } + } +}; // End of CommunicationPlan -#endif // end if have MPI +#endif // end if HAVE_MPI +#endif // end if COMMUNICATION_PLAN_H -#endif // COMMUNICATION_PLAN_H diff --git a/src/include/communication_plan_old.h b/src/include/communication_plan_old.h new file mode 100644 index 00000000..302cb119 --- /dev/null +++ b/src/include/communication_plan_old.h @@ -0,0 +1,135 @@ +#ifndef COMMUNICATION_PLAN_H +#define COMMUNICATION_PLAN_H +/********************************************************************************************** + © 2020. Triad National Security, LLC. All rights reserved. + This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos + National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. + Department of Energy/National Nuclear Security Administration. All rights in the program are + reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear + Security Administration. The Government is granted for itself and others acting on its behalf a + nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare + derivative works, distribute copies to the public, perform publicly and display publicly, and + to permit others to do so. + This program is open source under the BSD-3 License. 
+ Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior + written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + **********************************************************************************************/ + +#include "host_types.h" +#include "kokkos_types.h" +#include +#ifdef HAVE_MPI +#include +#include "partition_map.h" + +namespace mtr +{ + +///////////////////////// +/* CommunicationPlan: Class storing relevant data and functions to perform comms between two different MATAR MPI types. + The object for this class should not be reconstructed if the same comm plan is needed repeatedly; the setup is expensive. 
+ The comms routines such as execute_comms can be called repeatedly to avoid repeated setup of the plan.*/ +///////////////////////// +template +class CommunicationPlan { + + // this is manage + using TArray1D = Kokkos::DualView ; + +protected: + +public: + + /*forward comms means communicating data to a vector that doesn't have a unique distribution of its global + indices amongst processes from a vector that does have a unique distribution amongst processes. + An example of forward comms in a finite element application would be communicating ghost data from + the vector of local data. + + reverse comms means communicating data to a vector that has a unique distribution of its global + indices amongst processes from a vector that does not have a unique distribution amongst processes. + An example of reverse comms in a finite element application would be communicating force contributions from ghost + indices via summation to the entries of the uniquely owned vector that stores final tallies of forces. 
+ */ + bool reverse_comms_flag; //default is false + + CommunicationPlan(); + + //Copy Constructor + CommunicationPlan(const CommunicationPlan &temp){ + *this = temp; + } + + CommunicationPlan(bool reverse_comms); + + KOKKOS_INLINE_FUNCTION + CommunicationPlan& operator=(const CommunicationPlan& temp); + + // Deconstructor + virtual KOKKOS_INLINE_FUNCTION + ~CommunicationPlan (); + + virtual void execute_comms(){} +}; // End of CommunicationPlan + + +// Default constructor +template +CommunicationPlan::CommunicationPlan() { + +} + +// Overloaded 1D constructor +template +CommunicationPlan::CommunicationPlan(bool reverse_comms) { + reverse_comms_flag = reverse_comms; +} + + +template +KOKKOS_INLINE_FUNCTION +CommunicationPlan& CommunicationPlan::operator= (const CommunicationPlan& temp) { + + // Do nothing if the assignment is of the form x = x + if (this != &temp) { + reverse_comms_flag = temp.reverse_comms_flag; + } + + return *this; +} + +template +KOKKOS_INLINE_FUNCTION +CommunicationPlan::~CommunicationPlan() {} + +//////////////////////////////////////////////////////////////////////////////// +// End of CommunicationPlan +//////////////////////////////////////////////////////////////////////////////// + +} // end namespace + +#endif // end if have MPI + +#endif // COMMUNICATION_PLAN_H + diff --git a/src/include/mapped_mpi_types.h b/src/include/mapped_mpi_types.h index 6d5d18d3..ed690ca6 100644 --- a/src/include/mapped_mpi_types.h +++ b/src/include/mapped_mpi_types.h @@ -45,7 +45,6 @@ #include #include #include "partition_map.h" -#include "communication_plan.h" namespace mtr { diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index b10a57fc..5f83265b 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -1,121 +1,148 @@ -#ifndef MPI_TYPES_H -#define MPI_TYPES_H -/********************************************************************************************** - © 2020. Triad National Security, LLC. All rights reserved. 
- This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos - National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. - Department of Energy/National Nuclear Security Administration. All rights in the program are - reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear - Security Administration. The Government is granted for itself and others acting on its behalf a - nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare - derivative works, distribute copies to the public, perform publicly and display publicly, and - to permit others to do so. - This program is open source under the BSD-3 License. - Redistribution and use in source and binary forms, with or without modification, are permitted - provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, this list of - conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, this list of - conditions and the following disclaimer in the documentation and/or other materials - provided with the distribution. - - 3. Neither the name of the copyright holder nor the names of its contributors may be used - to endorse or promote products derived from this software without specific prior - written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - **********************************************************************************************/ - -#include "host_types.h" -#include "kokkos_types.h" -#include +#ifndef MPICARRAYKOKKOS_H +#define MPICARRAYKOKKOS_H + #ifdef HAVE_MPI #include +#include "matar.h" +#include "communication_plan.h" namespace mtr { +// Type trait to map C++ types to MPI_Datatype +template +struct mpi_type_map { + static MPI_Datatype value() { + static_assert(sizeof(T) == 0, "Unsupported type for MPI communication"); + return MPI_DATATYPE_NULL; + } +}; + +// Specializations for common types +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_INT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_FLOAT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_DOUBLE; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_CHAR; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_CHAR; } +}; + +template <> +struct 
mpi_type_map { + static MPI_Datatype value() { return MPI_C_BOOL; } +}; + + ///////////////////////// -// MPIArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// MPICArrayKokkos: Type for managing distributed data on both CPU and GPU. ///////////////////////// template -class MPIArrayKokkos { +class MPICArrayKokkos { + + // Dual view for managing data on both CPU and GPU + DCArrayKokkos this_array_; - // this is manage - using TArray1D = Kokkos::DualView ; + DCArrayKokkos send_buffer_; + DCArrayKokkos recv_buffer_; protected: - size_t dims_[7]; - size_t length_; - size_t order_; // tensor order (rank) - int mpi_recv_rank_; - int mpi_tag_; + size_t dims_[7] = {0,0,0,0,0,0,0}; + size_t length_ = 0; + size_t order_ = 0; // tensor order (rank) + MPI_Comm mpi_comm_; MPI_Status mpi_status_; MPI_Datatype mpi_datatype_; MPI_Request mpi_request_; - TArray1D this_array_; - - void set_mpi_type(); -public: - // Data member to access host view - ViewCArray host; - - MPIArrayKokkos(); - MPIArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + // --- Ghost Communication Support --- + CommunicationPlan* comm_plan_ = NULL; // Pointer to shared communication plan - MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); + DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank + DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank + DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank + DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, 
- size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); + size_t stride_; // [size: num_dims] Number of contiguous values per first index element - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, - size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_by_rank] Indices of items to send to each rank + DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank + - // These functions can setup the data needed for halo send/receives - // Not necessary for standard MPI comms - void mpi_setup(); + size_t num_owned_; // Number of owned items (nodes/elements) + size_t num_ghost_; // Number of ghost items (nodes/elements) - void mpi_setup(int recv_rank); +public: + // Data member to access host view (initialized as pointer to this_array_.host_pointer()) + ViewCArray host; - void mpi_setup(int recv_rank, int tag); - void mpi_setup(int recv_rank, int tag, MPI_Comm comm); + // Note, consider this for sending blocks without dealing with stride_ + // MPI_Datatype vector_type; + // MPI_Type_contiguous(stride_, mpi_type_map::value(), &vector_type); + // MPI_Type_commit(&vector_type); - void mpi_set_rank(int recv_rank); + MPICArrayKokkos(); + + MPICArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); - void mpi_set_tag(int tag); + MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - void mpi_set_comm(MPI_Comm comm); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); - int get_rank(); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, const 
std::string& tag_string = DEFAULTSTRINGARRAY); - int get_tag(); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPI_Comm get_comm(); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + KOKKOS_INLINE_FUNCTION T& operator()(size_t i) const; @@ -140,7 +167,52 @@ class MPIArrayKokkos { size_t n, size_t o) const; KOKKOS_INLINE_FUNCTION - MPIArrayKokkos& operator=(const MPIArrayKokkos& temp); + MPICArrayKokkos& operator=(const MPICArrayKokkos& temp); + + + // Method to set comm plan for halo communication + void initialize_comm_plan(CommunicationPlan& comm_plan){ + comm_plan_ = &comm_plan; + + if(comm_plan_->comm_type == communication_plan_type::no_communication){ + return; + } + + size_t send_size = comm_plan_->total_send_count * stride_; + size_t recv_size = comm_plan_->total_recv_count * stride_; + + if (send_size > 0) { + send_buffer_ = DCArrayKokkos(send_size, "send_buffer"); + } + if (recv_size > 0) { + recv_buffer_ = DCArrayKokkos(recv_size, "recv_buffer"); + } + + if (comm_plan_->num_send_ranks > 0) { + send_counts_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_counts"); + send_displs_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_displs"); + + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + send_counts_.host(i) = comm_plan_->send_counts_.host(i) * stride_; + send_displs_.host(i) = comm_plan_->send_displs_.host(i) * stride_; + } + send_counts_.update_device(); + send_displs_.update_device(); + } + + if (comm_plan_->num_recv_ranks > 0) { + recv_counts_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_counts"); + recv_displs_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_displs"); + + for(int 
i = 0; i < comm_plan_->num_recv_ranks; i++){ + recv_counts_.host(i) = comm_plan_->recv_counts_.host(i) * stride_; + recv_displs_.host(i) = comm_plan_->recv_displs_.host(i) * stride_; + } + recv_counts_.update_device(); + recv_displs_.update_device(); + } + }; + // GPU Method // Method that returns size @@ -168,7 +240,7 @@ class MPIArrayKokkos { // Method returns kokkos dual view KOKKOS_INLINE_FUNCTION - TArray1D get_kokkos_dual_view() const; + Kokkos::DualView get_kokkos_dual_view() const; // Method that update host view void update_host(); @@ -176,167 +248,188 @@ class MPIArrayKokkos { // Method that update device view void update_device(); - // MPI send wrapper - void send(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI recieve wrapper - void recv(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI broadcast wrapper - void broadcast(size_t count, int root, MPI_Comm comm); - - // MPI scatter wrapper - void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // MPI gather wrapper - void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // MPI allgather wrapper - void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); - - // MPI send wrapper - void isend(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI recieve wrapper - void irecv(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI wait wrapper for sender - void wait_send(); - - // MPI wait wrapper for receiver - void wait_recv(); - - // MPI barrier wrapper - //void barrier(MPI_Comm comm); - - // MPI send wrapper - void halo_send(); - - // MPI recieve wrapper - void halo_recv(); - - // MPI send wrapper - void halo_isend(); - - // MPI recieve wrapper - void halo_irecv(); + // Method that builds the send buffer, note, this has to be ordered + // Such that all the boundary elements going to a given rank are contiguous in the send buffer. 
+ void fill_send_buffer(){ + + // Copy this_array_ to the host + this_array_.update_host(); + MATAR_FENCE(); + + size_t send_idx = 0; + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ + size_t src_idx = comm_plan_->send_indices_.host(i, j); // index of the element to send + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + send_buffer_.host(send_idx + k) = this_array_.host_pointer()[src_idx * stride_ + k]; + } + send_idx += stride_; + } + } + }; + + // Method that copies the recv buffer into the this_array + void copy_recv_buffer(){ + + size_t recv_idx = 0; + for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ + for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ + size_t dest_idx = comm_plan_->recv_indices_.host(i, j); + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + this_array_.host_pointer()[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); + } + + recv_idx += stride_; + } + } + }; + + + // Note: This "may" be needed, im not sure. Currently, it works.... + // Use nullptr for empty arrays to avoid accessing element 0 of 0-sized array (undefined behavior) + // T* send_buf_ptr = (send_buffer_.size() > 0) ? &send_buffer_.host(0) : nullptr; + // T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; + // int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; + // int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; + // int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; + // int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; + + // Method that communicates the data between the ranks + // NOTE: This is a blocking communication operation, + // if you want to use non-blocking communication, you can use the following: MPI_Ineighbor_alltoallv + + // TODO: Replace this with persistent communicator: + // MPI_Request req; + + // // Create persistent operation ONCE + // MPI_Neighbor_alltoallv_init( + // sendbuf, sendcounts, sdispls, mpi_type_map::value(), + // recvbuf, recvcounts, rdispls, mpi_type_map::value(), + // comm_plan_->mpi_comm_graph, + // MPI_INFO_NULL, + // &req); + + // // Then inside time step loop: + // MPI_Start(&req); + // // modify sendbuf in-place as needed + // MPI_Wait(&req); + + void communicate(){ + + fill_send_buffer(); + + MPI_Neighbor_alltoallv( + send_buffer_.host_pointer(), + send_counts_.host_pointer(), + send_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE + recv_buffer_.host_pointer(), + recv_counts_.host_pointer(), + recv_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE + comm_plan_->mpi_comm_graph); + + copy_recv_buffer(); + this_array_.update_device(); + MATAR_FENCE(); + }; + + void set_values(const T& value){ + this_array_.set_values(value); + }; // Deconstructor virtual KOKKOS_INLINE_FUNCTION - ~MPIArrayKokkos (); -}; // End of MPIArrayKokkos - + ~MPICArrayKokkos (); +}; // End of MPIDArrayKokkos // Default constructor template -MPIArrayKokkos::MPIArrayKokkos() { - length_ = order_ = 0; - for (int i = 0; i < 7; i++) { - dims_[i] = 0; - } +MPICArrayKokkos::MPICArrayKokkos() + : this_array_(), stride_(1), length_(0), order_(0) { + for (int i = 0; i < 7; i++) { + dims_[i] = 0; + } } // Overloaded 1D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) + : stride_(1), length_(dim0), order_(1) { dims_[0] = dim0; - order_ = 1; - length_ = dim0; - this_array_ = 
TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0); } // Overloaded 2D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) + : stride_(dim1), length_(dim0 * dim1), order_(2) { dims_[0] = dim0; dims_[1] = dim1; - order_ = 2; - length_ = (dim0 * dim1); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1); - set_mpi_type(); + + this_array_ = DCArrayKokkos(dim0, dim1, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1); } +// Overloaded 3D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) + : stride_(dim1 * dim2), length_(dim0 * dim1 * dim2), order_(3) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; - order_ = 3; - length_ = (dim0 * dim1 * dim2); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2); } +// Overloaded 4D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3), length_(dim0 * dim1 * dim2 * dim3), order_(4) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; - 
order_ = 4; - length_ = (dim0 * dim1 * dim2 * dim3); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); } +// Overloaded 5D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4), length_(dim0 * dim1 * dim2 * dim3 * dim4), order_(5) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; dims_[4] = dim4; - order_ = 5; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); } +// Overloaded 6D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, size_t dim5, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5), order_(6) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; dims_[4] = dim4; dims_[5] = dim5; - order_ = 6; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, 
dim5); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); } +// Overloaded 7D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, size_t dim5, - size_t dim6, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6), order_(7) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; @@ -344,441 +437,192 @@ MPIArrayKokkos::MPIArrayKokkos(size_t dim0, siz dims_[4] = dim4; dims_[5] = dim5; dims_[6] = dim6; - order_ = 7; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); } -template -void MPIArrayKokkos::set_mpi_type() { - if (typeid(T).name() == typeid(bool).name()) { - mpi_datatype_ = MPI_C_BOOL; - } - else if (typeid(T).name() == typeid(int).name()) { - mpi_datatype_ = MPI_INT; - } - else if (typeid(T).name() == typeid(long int).name()) { - mpi_datatype_ = MPI_LONG; - } - else if (typeid(T).name() == typeid(long long int).name()) { - mpi_datatype_ = MPI_LONG_LONG_INT; - } - else if (typeid(T).name() == typeid(float).name()) { - mpi_datatype_ = MPI_FLOAT; - } - else if (typeid(T).name() == typeid(double).name()) { - mpi_datatype_ = MPI_DOUBLE; - } - else { - printf("Your entered MPIArrayKokkos type is not a supported type for MPI communications and is being set to 
int\n"); - mpi_datatype_ = MPI_INT; - } -} template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i) const { - assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 1D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 1D!"); - return this_array_.d_view(i); +T& MPICArrayKokkos::operator()(size_t i) const { + assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 1D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 1D!"); + return this_array_(i); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j) const { - assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 2D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 2D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 2D!"); - return this_array_.d_view(j + (i * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j) const { + assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 2D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 2D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 2D!"); + return this_array_(i, j); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k) const { - assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 3D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 3D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 3D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 3D!"); - return this_array_.d_view(k + (j * dims_[2]) - + (i * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k) const { + assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 3D!"); + 
assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 3D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 3D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 3D!"); + return this_array_(i, j, k); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { - assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 4D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 4D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 4D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 4D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 4D!"); - return this_array_.d_view(l + (k * dims_[3]) - + (j * dims_[3] * dims_[2]) - + (i * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { + assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 4D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 4D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 4D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 4D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 4D!"); + return this_array_(i, j, k, l); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m) const { - assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 5D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 5D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 5D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 5D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 5D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in 
MPIArrayKokkos 5D!"); - return this_array_.d_view(m + (l * dims_[4]) - + (k * dims_[4] * dims_[3]) - + (j * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const { + assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 5D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 5D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 5D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 5D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 5D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 5D!"); + return this_array_(i, j, k, l, m); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m, size_t n) const { - assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 6D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 6D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 6D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 6D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 6D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 6D!"); - assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 6D!"); - return this_array_.d_view(n + (m * dims_[5]) - + (l * dims_[5] * dims_[4]) - + (k * dims_[5] * dims_[4] * dims_[3]) - + (j * dims_[5] * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) const { + assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 6D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 6D!"); + 
assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 6D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 6D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 6D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 6D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 6D!"); + return this_array_(i, j, k, l, m, n); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m, size_t n, size_t o) const { - assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 7D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 7D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 7D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 7D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 7D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 7D!"); - assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 7D!"); - assert(o >= 0 && o < dims_[6] && "o is out of bounds in MPIArrayKokkos 7D!"); - return this_array_.d_view(o + (n * dims_[6]) - + (m * dims_[6] * dims_[5]) - + (l * dims_[6] * dims_[5] * dims_[4]) - + (k * dims_[6] * dims_[5] * dims_[4] * dims_[3]) - + (j * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n, size_t o) const { + assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 7D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 7D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 7D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 7D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 7D!"); + 
assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 7D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 7D!"); + assert(o < dims_[6] && "o is out of bounds in MPICArrayKokkos 7D!"); + return this_array_(i, j, k, l, m, n, o); } template KOKKOS_INLINE_FUNCTION -MPIArrayKokkos& MPIArrayKokkos::operator= (const MPIArrayKokkos& temp) { +MPICArrayKokkos& MPICArrayKokkos::operator=(const MPICArrayKokkos& temp) { // Do nothing if the assignment is of the form x = x if (this != &temp) { + + this_array_ = temp.this_array_; + send_buffer_ = temp.send_buffer_; + recv_buffer_ = temp.recv_buffer_; + + length_ = temp.length_; + for (int iter = 0; iter < temp.order_; iter++){ dims_[iter] = temp.dims_[iter]; } // end for order_ = temp.order_; - length_ = temp.length_; - this_array_ = temp.this_array_; - host = temp.host; - mpi_recv_rank_ = temp.mpi_recv_rank_; - mpi_tag_ = temp.mpi_tag_; - mpi_comm_ = temp.mpi_comm_; + mpi_status_ = temp.mpi_status_; mpi_datatype_ = temp.mpi_datatype_; mpi_request_ = temp.mpi_request_; + comm_plan_ = temp.comm_plan_; + + send_counts_ = temp.send_counts_; + recv_counts_ = temp.recv_counts_; + send_displs_ = temp.send_displs_; + recv_displs_ = temp.recv_displs_; + stride_ = temp.stride_; + + send_indices_ = temp.send_indices_; + recv_indices_ = temp.recv_indices_; + + num_owned_ = temp.num_owned_; + num_ghost_ = temp.num_ghost_; + + host = temp.host; // Also copy the host ViewCArray } - return *this; } // Return size template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::size() const { - return length_; +size_t MPICArrayKokkos::size() const { + return this_array_.size(); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::extent() const { - return length_; +size_t MPICArrayKokkos::extent() const { + return this_array_.extent(); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::dims(size_t i) const { - assert(i < order_ && "MPIArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); - 
assert(i >= 0 && dims_[i]>0 && "Access to MPIArrayKokkos dims is out of bounds!"); - return dims_[i]; +size_t MPICArrayKokkos::dims(size_t i) const { + assert(i < order_ && "MPICArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); + assert(dims_[i] > 0 && "Access to MPICArrayKokkos dims is out of bounds!"); + return this_array_.dims(i); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::order() const { - return order_; +size_t MPICArrayKokkos::order() const { + return this_array_.order(); } template KOKKOS_INLINE_FUNCTION -T* MPIArrayKokkos::device_pointer() const { - return this_array_.d_view.data(); +T* MPICArrayKokkos::device_pointer() const { + return this_array_.device_pointer(); } template KOKKOS_INLINE_FUNCTION -T* MPIArrayKokkos::host_pointer() const { - return this_array_.h_view.data(); +T* MPICArrayKokkos::host_pointer() const { + return this_array_.host_pointer(); } template KOKKOS_INLINE_FUNCTION -Kokkos::DualView MPIArrayKokkos::get_kokkos_dual_view() const { - return this_array_; +Kokkos::DualView MPICArrayKokkos::get_kokkos_dual_view() const { + return this_array_.get_kokkos_dual_view(); } template -void MPIArrayKokkos::update_host() { - - this_array_.template modify(); - this_array_.template sync(); -} - -template -void MPIArrayKokkos::update_device() { - - this_array_.template modify(); - this_array_.template sync(); -} - -// a default setup, should not be used except for testing -template -void MPIArrayKokkos::mpi_setup() { - mpi_recv_rank_ = 1; - mpi_tag_ = 99; - mpi_comm_ = MPI_COMM_WORLD; -} - -template -void MPIArrayKokkos::mpi_setup(int recv_rank) { - mpi_recv_rank_ = recv_rank; -} - -template -void MPIArrayKokkos::mpi_setup(int recv_rank, int tag) { - mpi_recv_rank_ = recv_rank; - mpi_tag_ = tag; -} - -template -void MPIArrayKokkos::mpi_setup(int recv_rank, int tag, MPI_Comm comm) { - mpi_recv_rank_ = recv_rank; - mpi_tag_ = tag; - mpi_comm_ = comm; -} - -template -void MPIArrayKokkos::mpi_set_rank(int 
recv_rank) { - mpi_recv_rank_ = recv_rank; +void MPICArrayKokkos::update_host() { + this_array_.update_host(); } template -void MPIArrayKokkos::mpi_set_tag(int tag) { - mpi_tag_ = tag; +void MPICArrayKokkos::update_device() { + this_array_.update_device(); } template -void MPIArrayKokkos::mpi_set_comm(MPI_Comm comm) { - mpi_comm_ = comm; -} - -template -int MPIArrayKokkos::get_rank() { - return mpi_recv_rank_; -} - -template -int MPIArrayKokkos::get_tag() { - return mpi_tag_; -} - -template -MPI_Comm MPIArrayKokkos::get_comm() { - return mpi_comm_; -} - -//MPI_Send wrapper -template -void MPIArrayKokkos::send(size_t count, int dest, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Send(device_pointer(), count, mpi_datatype_, dest, tag, comm); -#else - update_host(); - MPI_Send(host_pointer(), count, mpi_datatype_, dest, tag, comm); -#endif -} - -//MPI_Recv wrapper -template -void MPIArrayKokkos::recv(size_t count, int source, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Recv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); -#else - MPI_Recv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); - update_device(); -#endif -} - -//MPI_Send halo wrapper -template -void MPIArrayKokkos::halo_send() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Send(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); -#else - update_host(); - MPI_Send(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); -#endif -} - -//MPI_Recv halo wrapper -template -void MPIArrayKokkos::halo_recv() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Recv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); -#else - MPI_Recv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); - update_device(); -#endif -} - -//MPI_iSend halo wrapper -template -void MPIArrayKokkos::halo_isend() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Isend(device_pointer(), 
size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#else - update_host(); - MPI_Isend(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#endif -} - -//MPI_iRecv halo wrapper -template -void MPIArrayKokkos::halo_irecv() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Irecv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#else - MPI_Irecv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#endif -} - -//MPI_Bcast wrapper -template -void MPIArrayKokkos::broadcast(size_t count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Bcast(device_pointer(), count, mpi_datatype_, root, comm); -#else - update_host(); - MPI_Bcast(host_pointer(), count, mpi_datatype_, root, comm); - update_device(); -#endif -} - -//MPI_Scatter wrapper -template -void MPIArrayKokkos::scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Scatter(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); -#else - update_host(); - MPI_Scatter(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_Gather wrapper -template -void MPIArrayKokkos::gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Gather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); -#else - update_host(); - MPI_Gather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_AllGather wrapper -template -void MPIArrayKokkos::allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t 
recv_count, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Allgather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, comm); -#else - update_host(); - MPI_Allgather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_Isend wrapper -template -void MPIArrayKokkos::isend(size_t count, int dest, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Isend(device_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); -#else - update_host(); - MPI_Isend(host_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); -#endif -} - -//MPI_Irecv wrapper -template -void MPIArrayKokkos::irecv(size_t count, int source, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Irecv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); -#else - MPI_Irecv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); -#endif -} - -//MPI_Wait wrapper for the sender -template -void MPIArrayKokkos::wait_send() { - MPI_Wait(&mpi_request_, &mpi_status_); -} +KOKKOS_INLINE_FUNCTION +MPICArrayKokkos::~MPICArrayKokkos() { -//MPI_Wait wrapper for the receiver -template -void MPIArrayKokkos::wait_recv() { - MPI_Wait(&mpi_request_, &mpi_status_); -#ifndef HAVE_GPU_AWARE_MPI - update_device(); -#endif } -//MPI_Barrier wrapper -//template -//void MPIArrayKokkos::barrier(MPI_Comm comm) { -// MPI_Barrier(comm); -//} +} // end namespace mtr -template -KOKKOS_INLINE_FUNCTION -MPIArrayKokkos::~MPIArrayKokkos() {} - -//////////////////////////////////////////////////////////////////////////////// -// End of MPIArrayKokkos -//////////////////////////////////////////////////////////////////////////////// - -} // end namespace #endif // end if have MPI - -#endif // MPI_TYPES_H - +#endif // end if MPICARRAYKOKKOS_H \ No newline at end of file diff --git a/src/include/mpi_types_old.h 
b/src/include/mpi_types_old.h new file mode 100644 index 00000000..b10a57fc --- /dev/null +++ b/src/include/mpi_types_old.h @@ -0,0 +1,784 @@ +#ifndef MPI_TYPES_H +#define MPI_TYPES_H +/********************************************************************************************** + © 2020. Triad National Security, LLC. All rights reserved. + This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos + National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. + Department of Energy/National Nuclear Security Administration. All rights in the program are + reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear + Security Administration. The Government is granted for itself and others acting on its behalf a + nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare + derivative works, distribute copies to the public, perform publicly and display publicly, and + to permit others to do so. + This program is open source under the BSD-3 License. + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior + written permission. 
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + **********************************************************************************************/ + +#include "host_types.h" +#include "kokkos_types.h" +#include +#ifdef HAVE_MPI +#include + +namespace mtr +{ + +///////////////////////// +// MPIArrayKokkos: Dual type for managing distributed data on both CPU and GPU. 
+///////////////////////// +template +class MPIArrayKokkos { + + // this is manage + using TArray1D = Kokkos::DualView ; + +protected: + size_t dims_[7]; + size_t length_; + size_t order_; // tensor order (rank) + int mpi_recv_rank_; + int mpi_tag_; + MPI_Comm mpi_comm_; + MPI_Status mpi_status_; + MPI_Datatype mpi_datatype_; + MPI_Request mpi_request_; + TArray1D this_array_; + + void set_mpi_type(); + +public: + // Data member to access host view + ViewCArray host; + + MPIArrayKokkos(); + + MPIArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + + // These functions can setup the data needed for halo send/receives + // Not necessary for standard MPI comms + void mpi_setup(); + + void mpi_setup(int recv_rank); + + void mpi_setup(int recv_rank, int tag); + + void mpi_setup(int recv_rank, int tag, MPI_Comm comm); + + void mpi_set_rank(int recv_rank); + + void mpi_set_tag(int tag); + + void mpi_set_comm(MPI_Comm comm); + + int get_rank(); + + int get_tag(); + + MPI_Comm get_comm(); + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, 
size_t j, size_t k) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n, size_t o) const; + + KOKKOS_INLINE_FUNCTION + MPIArrayKokkos& operator=(const MPIArrayKokkos& temp); + + // GPU Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t size() const; + + // Host Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t extent() const; + + KOKKOS_INLINE_FUNCTION + size_t dims(size_t i) const; + + KOKKOS_INLINE_FUNCTION + size_t order() const; + + // Method returns the raw device pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* device_pointer() const; + + // Method returns the raw host pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* host_pointer() const; + + // Method returns kokkos dual view + KOKKOS_INLINE_FUNCTION + TArray1D get_kokkos_dual_view() const; + + // Method that update host view + void update_host(); + + // Method that update device view + void update_device(); + + // MPI send wrapper + void send(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void recv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI broadcast wrapper + void broadcast(size_t count, int root, MPI_Comm comm); + + // MPI scatter wrapper + void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + + // MPI gather wrapper + void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + + // MPI allgather wrapper + void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); + + // MPI send wrapper + void 
isend(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void irecv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI wait wrapper for sender + void wait_send(); + + // MPI wait wrapper for receiver + void wait_recv(); + + // MPI barrier wrapper + //void barrier(MPI_Comm comm); + + // MPI send wrapper + void halo_send(); + + // MPI recieve wrapper + void halo_recv(); + + // MPI send wrapper + void halo_isend(); + + // MPI recieve wrapper + void halo_irecv(); + + // Deconstructor + virtual KOKKOS_INLINE_FUNCTION + ~MPIArrayKokkos (); +}; // End of MPIArrayKokkos + + +// Default constructor +template +MPIArrayKokkos::MPIArrayKokkos() { + length_ = order_ = 0; + for (int i = 0; i < 7; i++) { + dims_[i] = 0; + } +} + +// Overloaded 1D constructor +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, const std::string& tag_string) { + + dims_[0] = dim0; + order_ = 1; + length_ = dim0; + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0); + set_mpi_type(); +} + +// Overloaded 2D constructor +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + order_ = 2; + length_ = (dim0 * dim1); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + order_ = 3; + length_ = (dim0 * dim1 * dim2); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, const std::string& tag_string) { + + dims_[0] = dim0; + 
dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + order_ = 4; + length_ = (dim0 * dim1 * dim2 * dim3); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + order_ = 5; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, size_t dim5, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; + order_ = 6; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; + dims_[6] = dim6; + order_ = 7; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); + set_mpi_type(); +} + +template +void MPIArrayKokkos::set_mpi_type() { + if (typeid(T).name() == typeid(bool).name()) { + mpi_datatype_ = MPI_C_BOOL; + } + else if 
(typeid(T).name() == typeid(int).name()) { + mpi_datatype_ = MPI_INT; + } + else if (typeid(T).name() == typeid(long int).name()) { + mpi_datatype_ = MPI_LONG; + } + else if (typeid(T).name() == typeid(long long int).name()) { + mpi_datatype_ = MPI_LONG_LONG_INT; + } + else if (typeid(T).name() == typeid(float).name()) { + mpi_datatype_ = MPI_FLOAT; + } + else if (typeid(T).name() == typeid(double).name()) { + mpi_datatype_ = MPI_DOUBLE; + } + else { + printf("Your entered MPIArrayKokkos type is not a supported type for MPI communications and is being set to int\n"); + mpi_datatype_ = MPI_INT; + } +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i) const { + assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 1D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 1D!"); + return this_array_.d_view(i); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j) const { + assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 2D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 2D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 2D!"); + return this_array_.d_view(j + (i * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k) const { + assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 3D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 3D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 3D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 3D!"); + return this_array_.d_view(k + (j * dims_[2]) + + (i * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { + assert(order_ == 4 && "Tensor order (rank) does 
not match constructor in MPIArrayKokkos 4D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 4D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 4D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 4D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 4D!"); + return this_array_.d_view(l + (k * dims_[3]) + + (j * dims_[3] * dims_[2]) + + (i * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m) const { + assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 5D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 5D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 5D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 5D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 5D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 5D!"); + return this_array_.d_view(m + (l * dims_[4]) + + (k * dims_[4] * dims_[3]) + + (j * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m, size_t n) const { + assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 6D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 6D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 6D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 6D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 6D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 6D!"); + assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 
6D!"); + return this_array_.d_view(n + (m * dims_[5]) + + (l * dims_[5] * dims_[4]) + + (k * dims_[5] * dims_[4] * dims_[3]) + + (j * dims_[5] * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m, size_t n, size_t o) const { + assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 7D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 7D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 7D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 7D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 7D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 7D!"); + assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 7D!"); + assert(o >= 0 && o < dims_[6] && "o is out of bounds in MPIArrayKokkos 7D!"); + return this_array_.d_view(o + (n * dims_[6]) + + (m * dims_[6] * dims_[5]) + + (l * dims_[6] * dims_[5] * dims_[4]) + + (k * dims_[6] * dims_[5] * dims_[4] * dims_[3]) + + (j * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +MPIArrayKokkos& MPIArrayKokkos::operator= (const MPIArrayKokkos& temp) { + + // Do nothing if the assignment is of the form x = x + if (this != &temp) { + for (int iter = 0; iter < temp.order_; iter++){ + dims_[iter] = temp.dims_[iter]; + } // end for + + order_ = temp.order_; + length_ = temp.length_; + this_array_ = temp.this_array_; + host = temp.host; + mpi_recv_rank_ = temp.mpi_recv_rank_; + mpi_tag_ = temp.mpi_tag_; + mpi_comm_ = temp.mpi_comm_; + mpi_status_ = temp.mpi_status_; + mpi_datatype_ = temp.mpi_datatype_; + mpi_request_ = temp.mpi_request_; + } + + return *this; +} + +// Return 
size +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::size() const { + return length_; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::extent() const { + return length_; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::dims(size_t i) const { + assert(i < order_ && "MPIArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); + assert(i >= 0 && dims_[i]>0 && "Access to MPIArrayKokkos dims is out of bounds!"); + return dims_[i]; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::order() const { + return order_; +} + +template +KOKKOS_INLINE_FUNCTION +T* MPIArrayKokkos::device_pointer() const { + return this_array_.d_view.data(); +} + +template +KOKKOS_INLINE_FUNCTION +T* MPIArrayKokkos::host_pointer() const { + return this_array_.h_view.data(); +} + +template +KOKKOS_INLINE_FUNCTION +Kokkos::DualView MPIArrayKokkos::get_kokkos_dual_view() const { + return this_array_; +} + +template +void MPIArrayKokkos::update_host() { + + this_array_.template modify(); + this_array_.template sync(); +} + +template +void MPIArrayKokkos::update_device() { + + this_array_.template modify(); + this_array_.template sync(); +} + +// a default setup, should not be used except for testing +template +void MPIArrayKokkos::mpi_setup() { + mpi_recv_rank_ = 1; + mpi_tag_ = 99; + mpi_comm_ = MPI_COMM_WORLD; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank) { + mpi_recv_rank_ = recv_rank; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank, int tag) { + mpi_recv_rank_ = recv_rank; + mpi_tag_ = tag; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank, int tag, MPI_Comm comm) { + mpi_recv_rank_ = recv_rank; + mpi_tag_ = tag; + mpi_comm_ = comm; +} + +template +void MPIArrayKokkos::mpi_set_rank(int recv_rank) { + mpi_recv_rank_ = recv_rank; +} + +template +void MPIArrayKokkos::mpi_set_tag(int tag) { + mpi_tag_ = tag; +} + +template +void MPIArrayKokkos::mpi_set_comm(MPI_Comm comm) { + mpi_comm_ = 
comm; +} + +template +int MPIArrayKokkos::get_rank() { + return mpi_recv_rank_; +} + +template +int MPIArrayKokkos::get_tag() { + return mpi_tag_; +} + +template +MPI_Comm MPIArrayKokkos::get_comm() { + return mpi_comm_; +} + +//MPI_Send wrapper +template +void MPIArrayKokkos::send(size_t count, int dest, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Send(device_pointer(), count, mpi_datatype_, dest, tag, comm); +#else + update_host(); + MPI_Send(host_pointer(), count, mpi_datatype_, dest, tag, comm); +#endif +} + +//MPI_Recv wrapper +template +void MPIArrayKokkos::recv(size_t count, int source, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Recv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); +#else + MPI_Recv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); + update_device(); +#endif +} + +//MPI_Send halo wrapper +template +void MPIArrayKokkos::halo_send() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Send(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); +#else + update_host(); + MPI_Send(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); +#endif +} + +//MPI_Recv halo wrapper +template +void MPIArrayKokkos::halo_recv() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Recv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); +#else + MPI_Recv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); + update_device(); +#endif +} + +//MPI_iSend halo wrapper +template +void MPIArrayKokkos::halo_isend() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Isend(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#else + update_host(); + MPI_Isend(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#endif +} + +//MPI_iRecv halo wrapper +template +void MPIArrayKokkos::halo_irecv() { +#ifdef HAVE_GPU_AWARE_MPI + 
MPI_Irecv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#else + MPI_Irecv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#endif +} + +//MPI_Bcast wrapper +template +void MPIArrayKokkos::broadcast(size_t count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Bcast(device_pointer(), count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Bcast(host_pointer(), count, mpi_datatype_, root, comm); + update_device(); +#endif +} + +//MPI_Scatter wrapper +template +void MPIArrayKokkos::scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Scatter(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Scatter(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); + recv_buffer.update_device(); +#endif +} + +//MPI_Gather wrapper +template +void MPIArrayKokkos::gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Gather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Gather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); + recv_buffer.update_device(); +#endif +} + +//MPI_AllGather wrapper +template +void MPIArrayKokkos::allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Allgather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, comm); +#else + update_host(); + MPI_Allgather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, comm); + 
recv_buffer.update_device(); +#endif +} + +//MPI_Isend wrapper +template +void MPIArrayKokkos::isend(size_t count, int dest, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Isend(device_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); +#else + update_host(); + MPI_Isend(host_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); +#endif +} + +//MPI_Irecv wrapper +template +void MPIArrayKokkos::irecv(size_t count, int source, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Irecv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); +#else + MPI_Irecv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); +#endif +} + +//MPI_Wait wrapper for the sender +template +void MPIArrayKokkos::wait_send() { + MPI_Wait(&mpi_request_, &mpi_status_); +} + +//MPI_Wait wrapper for the receiver +template +void MPIArrayKokkos::wait_recv() { + MPI_Wait(&mpi_request_, &mpi_status_); +#ifndef HAVE_GPU_AWARE_MPI + update_device(); +#endif +} + +//MPI_Barrier wrapper +//template +//void MPIArrayKokkos::barrier(MPI_Comm comm) { +// MPI_Barrier(comm); +//} + +template +KOKKOS_INLINE_FUNCTION +MPIArrayKokkos::~MPIArrayKokkos() {} + +//////////////////////////////////////////////////////////////////////////////// +// End of MPIArrayKokkos +//////////////////////////////////////////////////////////////////////////////// + +} // end namespace + +#endif // end if have MPI + +#endif // MPI_TYPES_H + diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8f7fa4c2..e6c2bfaf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) project (matartest) diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt index 01cc23c0..a0e07edd 100644 --- a/test/test_cases/CMakeLists.txt +++ b/test/test_cases/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) 
+cmake_minimum_required(VERSION 3.5) # Find all test files in the current directory except test_main.cpp file(GLOB TEST_SOURCES "test_*.cpp")