diff --git a/.github/workflows/pypi-wheels-gpu.yml b/.github/workflows/pypi-wheels-gpu.yml index 0302f74..82b2227 100644 --- a/.github/workflows/pypi-wheels-gpu.yml +++ b/.github/workflows/pypi-wheels-gpu.yml @@ -31,7 +31,7 @@ jobs: uses: actions/cache@v4 with: path: .cibw-deps-cache - key: cibw-deps-gpu-cuda12-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-v1 + key: cibw-deps-gpu-cuda12.6-manylinux_2_34-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-gcc13-v2 - name: Build GPU wheels run: python -m cibuildwheel --output-dir wheelhouse @@ -42,7 +42,7 @@ jobs: # Use NVIDIA's CUDA-enabled manylinux image (CUDA 12.6, AlmaLinux 9) # This provides nvcc, CUDA runtime, and cuBLAS/cuSPARSE out of the box. - CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux_2_28_x86_64_cuda_12.6 + CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux_2_34_x86_64_cuda_12.6 # Build all dependencies with CUDA support. # HDF5 and libtiff are CPU-only (no GPU path needed). @@ -50,11 +50,16 @@ jobs: # AMReX is built with -DAMReX_GPU_BACKEND=CUDA for device kernels. 
CIBW_BEFORE_ALL_LINUX: > dnf install -y epel-release && - dnf --enablerepo=powertools install -y + dnf --enablerepo=crb install -y openmpi-devel gcc-gfortran gcc-c++ wget git - zlib-devel libjpeg-turbo-devel python3-pip && + zlib-devel libjpeg-turbo-devel python3-pip + gcc-toolset-13 gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ + gcc-toolset-13-gcc-gfortran && pip3 install "cmake>=3.28,<4" && - export PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH && + export PATH=/opt/rh/gcc-toolset-13/root/usr/bin:/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH && + export CC=/opt/rh/gcc-toolset-13/root/usr/bin/gcc && + export CXX=/opt/rh/gcc-toolset-13/root/usr/bin/g++ && + export FC=/opt/rh/gcc-toolset-13/root/usr/bin/gfortran && export CUDA_HOME=/usr/local/cuda && if [ -f /project/.cibw-deps-cache/deps.tar.gz ]; then echo "=== Restoring cached GPU dependencies ===" && @@ -92,7 +97,8 @@ jobs: --with-cuda-home=/usr/local/cuda --enable-shared=no CC=mpicc CXX=mpicxx FC=mpif90 CFLAGS="-O2 -fPIC" CXXFLAGS="-O2 -fPIC" FFLAGS="-O2 -fPIC" - CUDA_HOME=/usr/local/cuda && + CUDA_HOME=/usr/local/cuda + CUDAFLAGS="-allow-unsupported-compiler" && make -j$(nproc) && make install && cd ../.. && @@ -107,26 +113,35 @@ jobs: -DAMReX_FORTRAN=ON -DAMReX_PARTICLES=OFF -DAMReX_GPU_BACKEND=CUDA - -DAMReX_CUDA_ARCH=60;70;75;80;86;89;90 + '-DAMReX_CUDA_ARCH=60;70;75;80;86;89;90' -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_CUDA_ARCHITECTURES="60;70;75;80;86;89;90" && + '-DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90' + -DCMAKE_CUDA_HOST_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/g++ && cmake --build /tmp/amrex/build -j$(nproc) && cmake --install /tmp/amrex/build && mkdir -p /project/.cibw-deps-cache && tar czf /project/.cibw-deps-cache/deps.tar.gz /usr/local ; fi - CIBW_BEFORE_BUILD: pip install "cmake>=3.28,<4" + # Rename the package to openimpala-cuda for the GPU wheel. + # The import name stays 'openimpala' — only the PyPI distribution name changes. 
+ CIBW_BEFORE_BUILD: > + pip install "cmake>=3.28,<4" && + sed -i 's/^name = "openimpala"/name = "openimpala-cuda"/' /project/pyproject.toml # Point to MPI, CUDA, and our compiled GPU dependencies. CIBW_ENVIRONMENT_LINUX: > - PATH="/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH" + PATH="/opt/rh/gcc-toolset-13/root/usr/bin:/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH" + CC="/opt/rh/gcc-toolset-13/root/usr/bin/gcc" + CXX="/opt/rh/gcc-toolset-13/root/usr/bin/g++" + FC="/opt/rh/gcc-toolset-13/root/usr/bin/gfortran" CUDA_HOME="/usr/local/cuda" + CUDAHOSTCXX="/opt/rh/gcc-toolset-13/root/usr/bin/g++" CMAKE_C_COMPILER="mpicc" CMAKE_CXX_COMPILER="mpicxx" CMAKE_PREFIX_PATH="/usr/local" CMAKE_GENERATOR="Unix Makefiles" - CMAKE_ARGS="-DGPU_BACKEND=CUDA -DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90" + CMAKE_ARGS="-DGPU_BACKEND=CUDA '-DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90' -DCMAKE_CUDA_HOST_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/g++" # Vendor libraries but exclude host-specific MPI, OpenMP, Fortran runtime, # and CUDA runtime libraries (users must have CUDA toolkit installed). 
diff --git a/notebooks/profiling_and_tuning.ipynb b/notebooks/profiling_and_tuning.ipynb index b130165..83866cc 100644 --- a/notebooks/profiling_and_tuning.ipynb +++ b/notebooks/profiling_and_tuning.ipynb @@ -75,7 +75,7 @@ "source": [ "# Install system MPI and Python packages\n", "!apt-get install -y libopenmpi-dev > /dev/null 2>&1\n", - "!pip install openimpala[all] > /dev/null 2>&1\n", + "!pip install openimpala-cuda[all] > /dev/null 2>&1\n", "!pip install porespy > /dev/null 2>&1\n", "print(\"Dependencies installed.\")" ] diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 42243a7..abe260e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -21,7 +21,7 @@ find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) find_package(pybind11 2.11 CONFIG REQUIRED) # --- Build the extension module --- -pybind11_add_module(_core +set(BINDING_SOURCES bindings/module.cpp bindings/enums.cpp bindings/io.cpp @@ -29,6 +29,13 @@ pybind11_add_module(_core bindings/solvers.cpp bindings/config.cpp ) +pybind11_add_module(_core ${BINDING_SOURCES}) + +# When CUDA is enabled, AMReX headers contain __host__/__device__ attributes +# that require compilation by nvcc. 
+if(GPU_BACKEND STREQUAL "CUDA") + set_source_files_properties(${BINDING_SOURCES} PROPERTIES LANGUAGE CUDA) +endif() target_link_libraries(_core PRIVATE openimpala_io diff --git a/python/openimpala/__init__.py b/python/openimpala/__init__.py index 105ef07..18fe931 100644 --- a/python/openimpala/__init__.py +++ b/python/openimpala/__init__.py @@ -27,7 +27,10 @@ try: __version__ = version("openimpala") except PackageNotFoundError: - __version__ = "unknown" + try: + __version__ = version("openimpala-cuda") + except PackageNotFoundError: + __version__ = "unknown" # Session context manager (pure Python — always available) from .session import Session diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 91ce955..597eb25 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,7 +7,7 @@ # IO Library # ============================================================================== # Library sources (exclude test drivers that start with 't') -add_library(openimpala_io OBJECT +set(IO_SOURCES io/CathodeWrite.cpp io/DatReader.cpp io/HDF5Reader.cpp @@ -15,6 +15,7 @@ add_library(openimpala_io OBJECT io/RawReader.cpp io/TiffReader.cpp ) +add_library(openimpala_io OBJECT ${IO_SOURCES}) target_include_directories(openimpala_io PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/io @@ -36,7 +37,7 @@ target_include_directories(openimpala_io PUBLIC # ============================================================================== # Props Library (C++ only — Fortran kernels migrated to native C++) # ============================================================================== -add_library(openimpala_props OBJECT +set(PROPS_SOURCES props/ConnectedComponents.cpp props/DeffTensor.cpp props/EffectiveDiffusivityHypre.cpp @@ -53,6 +54,7 @@ add_library(openimpala_props OBJECT props/TortuositySolverBase.cpp props/VolumeFraction.cpp ) +add_library(openimpala_props OBJECT ${PROPS_SOURCES}) target_include_directories(openimpala_props PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/props @@ -101,5 +103,19 @@ 
target_include_directories(Diffusion PRIVATE ${HDF5_INCLUDE_DIRS} ) +# ============================================================================== +# CUDA: compile all C++ sources as CUDA when GPU backend is enabled +# ============================================================================== +# AMReX headers use __host__/__device__ attributes (AMREX_GPU_HOST_DEVICE) +# when built with CUDA. Any translation unit that includes AMReX headers must +# therefore be compiled by nvcc. We achieve this by setting the LANGUAGE +# property to CUDA on all .cpp source files. +if(GPU_BACKEND STREQUAL "CUDA") + set_source_files_properties( + ${IO_SOURCES} ${PROPS_SOURCES} props/Diffusion.cpp + PROPERTIES LANGUAGE CUDA + ) +endif() + # Install the main executable install(TARGETS Diffusion RUNTIME DESTINATION bin) diff --git a/src/io/HDF5Reader.H b/src/io/HDF5Reader.H index 4af9eb6..579ad96 100644 --- a/src/io/HDF5Reader.H +++ b/src/io/HDF5Reader.H @@ -160,6 +160,14 @@ public: } + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- + // Template function to read a hyperslab and apply thresholding + // Needed because the native type read from file varies. + template + void readAndThresholdFab(H5::DataSet& dataset, double raw_threshold, int value_if_true, + int value_if_false, const amrex::Box& box, + amrex::IArrayBox& fab) const; + private: /** * @brief Internal implementation for reading HDF5 metadata only. @@ -169,13 +177,6 @@ private: */ bool readMetadataInternal(); - // Template function to read a hyperslab and apply thresholding - // Needed because the native type read from file varies. 
- template - void readAndThresholdFab(H5::DataSet& dataset, double raw_threshold, int value_if_true, - int value_if_false, const amrex::Box& box, - amrex::IArrayBox& fab) const; - // --- Member Variables --- std::string m_filename; /**< Filename of the source HDF5 file */ std::string m_hdf5dataset; /**< Path to the dataset within the HDF5 file */ diff --git a/src/props/ConnectedComponents.H b/src/props/ConnectedComponents.H index a879cd7..b8bf127 100644 --- a/src/props/ConnectedComponents.H +++ b/src/props/ConnectedComponents.H @@ -59,12 +59,13 @@ public: return m_labels; } -private: + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- void run(const amrex::iMultiFab& mf_phase, int phase_id); amrex::IntVect findNextUnlabeled(const amrex::iMultiFab& labelMF, const amrex::iMultiFab& phaseFab, int phaseID) const; +private: amrex::Geometry m_geom; amrex::BoxArray m_ba; amrex::DistributionMapping m_dm; diff --git a/src/props/ConnectedComponents.cpp b/src/props/ConnectedComponents.cpp index 96864f7..c424ec0 100644 --- a/src/props/ConnectedComponents.cpp +++ b/src/props/ConnectedComponents.cpp @@ -108,8 +108,8 @@ void ConnectedComponents::run(const amrex::iMultiFab& mf_phase, int phase_id) { // Compute volume of each component using GPU-compatible atomic scatter-add m_volumes.resize(m_num_components, 0); - amrex::Gpu::DeviceVector d_volumes(m_num_components, 0); - long long* d_vol_ptr = d_volumes.data(); + amrex::Gpu::DeviceVector d_volumes(m_num_components, 0); + int* d_vol_ptr = d_volumes.data(); const int num_comp = m_num_components; for (amrex::MFIter mfi(m_labels); mfi.isValid(); ++mfi) { @@ -119,12 +119,12 @@ void ConnectedComponents::run(const amrex::iMultiFab& mf_phase, int phase_id) { amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { int lbl = label_arr(i, j, k, 0); if (lbl > 0 && lbl <= num_comp) { - amrex::Gpu::Atomic::Add(&d_vol_ptr[lbl - 1], 1LL); + amrex::Gpu::Atomic::Add(&d_vol_ptr[lbl - 1], 1); } 
}); } - std::vector local_volumes(m_num_components); + std::vector local_volumes(m_num_components); amrex::Gpu::copy(amrex::Gpu::deviceToHost, d_volumes.begin(), d_volumes.end(), local_volumes.begin()); @@ -132,7 +132,7 @@ void ConnectedComponents::run(const amrex::iMultiFab& mf_phase, int phase_id) { amrex::ParallelAllReduce::Sum(local_volumes.data(), m_num_components, amrex::ParallelContext::CommunicatorSub()); } - m_volumes = local_volumes; + m_volumes.assign(local_volumes.begin(), local_volumes.end()); } } // namespace OpenImpala diff --git a/src/props/EffectiveDiffusivityHypre.H b/src/props/EffectiveDiffusivityHypre.H index 5381ce1..26a3255 100644 --- a/src/props/EffectiveDiffusivityHypre.H +++ b/src/props/EffectiveDiffusivityHypre.H @@ -99,6 +99,9 @@ public: void getChiSolution(amrex::MultiFab& chi_field); + // --- Public for CUDA __device__ lambda compatibility --- + void initializeDiffCoeff(); + // --- Public Getters for Status and Solver Information --- // Solver statistics are inherited from HypreStructSolver: // getSolverConverged(), getFinalRelativeResidualNorm(), getSolverIterations() diff --git a/src/props/EffectiveDiffusivityHypre.cpp b/src/props/EffectiveDiffusivityHypre.cpp index 38ee8e8..7218f7f 100644 --- a/src/props/EffectiveDiffusivityHypre.cpp +++ b/src/props/EffectiveDiffusivityHypre.cpp @@ -164,31 +164,7 @@ EffectiveDiffusivityHypre::EffectiveDiffusivityHypre( // Build coefficient MultiFab using a device-accessible lookup table m_mf_diff_coeff.setVal(0.0); - { - int max_pid = 0; - for (const auto& kv : m_phase_coeff_map) { - max_pid = std::max(max_pid, kv.first); - } - amrex::Gpu::DeviceVector d_coeff_lut(max_pid + 1, 0.0); - amrex::Gpu::HostVector h_coeff_lut(max_pid + 1, 0.0); - for (const auto& kv : m_phase_coeff_map) { - h_coeff_lut[kv.first] = kv.second; - } - amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_coeff_lut.begin(), h_coeff_lut.end(), - d_coeff_lut.begin()); - const amrex::Real* lut_ptr = d_coeff_lut.data(); - const int lut_size 
= max_pid + 1; - - for (amrex::MFIter mfi(m_mf_diff_coeff, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { - const amrex::Box& bx = mfi.growntilebox(); - amrex::Array4 const dc_arr = m_mf_diff_coeff.array(mfi); - amrex::Array4 const phase_arr = m_mf_phase_original.const_array(mfi); - amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { - int pid = phase_arr(i, j, k, 0); - dc_arr(i, j, k, 0) = (pid >= 0 && pid < lut_size) ? lut_ptr[pid] : 0.0; - }); - } - } + initializeDiffCoeff(); m_mf_diff_coeff.FillBoundary(m_geom.periodicity()); m_mf_active_mask.setVal(cell_inactive); @@ -235,6 +211,32 @@ EffectiveDiffusivityHypre::EffectiveDiffusivityHypre( // Destructor is defaulted in the header — base class handles HYPRE cleanup. +void EffectiveDiffusivityHypre::initializeDiffCoeff() { + int max_pid = 0; + for (const auto& kv : m_phase_coeff_map) { + max_pid = std::max(max_pid, kv.first); + } + amrex::Gpu::DeviceVector d_coeff_lut(max_pid + 1, 0.0); + amrex::Gpu::HostVector h_coeff_lut(max_pid + 1, 0.0); + for (const auto& kv : m_phase_coeff_map) { + h_coeff_lut[kv.first] = kv.second; + } + amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_coeff_lut.begin(), h_coeff_lut.end(), + d_coeff_lut.begin()); + const amrex::Real* lut_ptr = d_coeff_lut.data(); + const int lut_size = max_pid + 1; + + for (amrex::MFIter mfi(m_mf_diff_coeff, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + amrex::Array4 const dc_arr = m_mf_diff_coeff.array(mfi); + amrex::Array4 const phase_arr = m_mf_phase_original.const_array(mfi); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { + int pid = phase_arr(i, j, k, 0); + dc_arr(i, j, k, 0) = (pid >= 0 && pid < lut_size) ? 
lut_ptr[pid] : 0.0; + }); + } +} + void EffectiveDiffusivityHypre::generateActiveMask() { BL_PROFILE("EffectiveDiffusivityHypre::generateActiveMask"); @@ -711,7 +713,8 @@ void EffectiveDiffusivityHypre::getChiSolution(amrex::MultiFab& chi_field) { soln_buffer.data()); if (get_ierr != 0) { amrex::Warning("HYPRE_StructVectorGetBoxValues failed during getChiSolution!"); - chi_field[mfi].setVal(0.0, bx_getsol, ChiComp, numComponentsChi); + chi_field[mfi].template setVal(0.0, bx_getsol, ChiComp, + numComponentsChi); continue; } diff --git a/src/props/FloodFill.cpp b/src/props/FloodFill.cpp index 5f30766..7cf2c49 100644 --- a/src/props/FloodFill.cpp +++ b/src/props/FloodFill.cpp @@ -177,7 +177,10 @@ void parallelFloodFill(amrex::iMultiFab& reachabilityMask, const amrex::iMultiFa // Reset device flag #ifdef AMREX_USE_GPU - d_changed = amrex::Gpu::DeviceScalar(0); + { + int zero = 0; + amrex::Gpu::htod_memcpy(d_changed.dataPtr(), &zero, sizeof(int)); + } d_flag_ptr = d_changed.dataPtr(); #else h_changed = 0; diff --git a/src/props/PercolationCheck.H b/src/props/PercolationCheck.H index 5d27f0a..f67a3a8 100644 --- a/src/props/PercolationCheck.H +++ b/src/props/PercolationCheck.H @@ -56,9 +56,10 @@ public: /** @brief Returns a human-readable direction string ("X", "Y", or "Z"). 
*/ static std::string directionString(OpenImpala::Direction dir); -private: + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- void run(const amrex::iMultiFab& mf_phase, int phase_id, OpenImpala::Direction dir); +private: amrex::Geometry m_geom; amrex::BoxArray m_ba; amrex::DistributionMapping m_dm; diff --git a/src/props/REVStudy.cpp b/src/props/REVStudy.cpp index 2d4ae7d..20f2aa9 100644 --- a/src/props/REVStudy.cpp +++ b/src/props/REVStudy.cpp @@ -131,7 +131,7 @@ void runREVStudy(const amrex::Geometry& geom_full, const amrex::BoxArray& ba_ful const amrex::Box& dest_box = mfi.validbox(); amrex::Box src_box = dest_box; src_box.shift(bx_rev.smallEnd()); - dest_fab.copy(src_fab, src_box, 0, dest_box, 0, 1); + dest_fab.template copy(src_fab, src_box, 0, dest_box, 0, 1); } mf_phase_rev.FillBoundary(geom_rev.periodicity()); diff --git a/src/props/ThroughThicknessProfile.H b/src/props/ThroughThicknessProfile.H index 2d200c2..a4cc428 100644 --- a/src/props/ThroughThicknessProfile.H +++ b/src/props/ThroughThicknessProfile.H @@ -54,6 +54,10 @@ public: return static_cast(m_vf_profile.size()); } + /** @brief Compute the profile (public for CUDA __device__ lambda compatibility). 
*/ + void compute(const amrex::Geometry& geom, const amrex::iMultiFab& mf_phase, int phase_id, + OpenImpala::Direction dir, int comp); + private: std::vector m_vf_profile; }; diff --git a/src/props/ThroughThicknessProfile.cpp b/src/props/ThroughThicknessProfile.cpp index 81bb6b3..8e6c24c 100644 --- a/src/props/ThroughThicknessProfile.cpp +++ b/src/props/ThroughThicknessProfile.cpp @@ -12,6 +12,11 @@ namespace OpenImpala { ThroughThicknessProfile::ThroughThicknessProfile(const amrex::Geometry& geom, const amrex::iMultiFab& mf_phase, int phase_id, OpenImpala::Direction dir, int comp) { + compute(geom, mf_phase, phase_id, dir, comp); +} + +void ThroughThicknessProfile::compute(const amrex::Geometry& geom, const amrex::iMultiFab& mf_phase, + int phase_id, OpenImpala::Direction dir, int comp) { AMREX_ALWAYS_ASSERT_WITH_MESSAGE(comp >= 0 && comp < mf_phase.nComp(), "ThroughThicknessProfile: Component index out of bounds."); @@ -20,10 +25,10 @@ ThroughThicknessProfile::ThroughThicknessProfile(const amrex::Geometry& geom, const int n_slices = domain.length(idir); // GPU-compatible per-slice accumulation using device vectors with atomic scatter-add - amrex::Gpu::DeviceVector d_slice_phase(n_slices, 0); - amrex::Gpu::DeviceVector d_slice_total(n_slices, 0); - long long* d_phase_ptr = d_slice_phase.data(); - long long* d_total_ptr = d_slice_total.data(); + amrex::Gpu::DeviceVector d_slice_phase(n_slices, 0); + amrex::Gpu::DeviceVector d_slice_total(n_slices, 0); + int* d_phase_ptr = d_slice_phase.data(); + int* d_total_ptr = d_slice_total.data(); const int target_phase = phase_id; const int phase_comp = comp; @@ -36,16 +41,16 @@ ThroughThicknessProfile::ThroughThicknessProfile(const amrex::Geometry& geom, amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { int coord = (idir == 0) ? i : (idir == 1) ? 
j : k; int idx = coord - slice_lo; - amrex::Gpu::Atomic::Add(&d_total_ptr[idx], 1LL); + amrex::Gpu::Atomic::Add(&d_total_ptr[idx], 1); if (fab(i, j, k) == target_phase) { - amrex::Gpu::Atomic::Add(&d_phase_ptr[idx], 1LL); + amrex::Gpu::Atomic::Add(&d_phase_ptr[idx], 1); } }); } // Copy device results to host - std::vector slice_phase(n_slices); - std::vector slice_total(n_slices); + std::vector slice_phase(n_slices); + std::vector slice_total(n_slices); amrex::Gpu::copy(amrex::Gpu::deviceToHost, d_slice_phase.begin(), d_slice_phase.end(), slice_phase.begin()); amrex::Gpu::copy(amrex::Gpu::deviceToHost, d_slice_total.begin(), d_slice_total.end(), diff --git a/src/props/TortuosityDirect.H b/src/props/TortuosityDirect.H index e7728d0..9f6678c 100644 --- a/src/props/TortuosityDirect.H +++ b/src/props/TortuosityDirect.H @@ -116,73 +116,15 @@ public: } -private: - /** @brief Executes the iterative solver loop until convergence or max iterations. Returns true - * if converged. */ + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- bool solve(); - - /** - * @brief Computes the global fluxes entering and exiting the domain boundaries. - * Expected to be called after solve() has successfully completed. - * @param fxin Output: Total flux entering the domain through the low boundary face. - * @param fxout Output: Total flux exiting the domain through the high boundary face. - */ void global_fluxes(amrex::Real& fxin, amrex::Real& fxout) const; - - /** - * @brief Computes a residual norm (e.g., L1 norm of change) between two solution states. - * Used as a convergence check in the iterative solver. - * @param phiold MultiFab containing the solution from the previous iteration. - * @param phinew MultiFab containing the solution from the current iteration. - * @return The calculated residual norm. 
- */ amrex::Real residual(const amrex::MultiFab& phiold, const amrex::MultiFab& phinew) const; - - /** @brief Sets up the m_bc member variable describing boundary condition types (e.g., - * Dirichlet, Neumann). */ - // void initializeBoundaryConditions() ; // FIX 2: Corrected spelling - void initializeBoundaryConditions(); // <<< FIX 2: Corrected spelling >>> - - /** @brief Creates and initializes the m_flux MultiFab array for storing face fluxes. */ - // void initializeFluxMultiFabs(); // FIX 2: Corrected spelling - void initializeFluxMultiFabs(); // <<< FIX 2: Corrected spelling >>> - - /** - * @brief Applies boundary conditions to the ghost cells of the potential MultiFab for a - * specific component. Uses the geometry and m_bc settings. Primarily handles external - * Dirichlet. - * @param phi The MultiFab whose ghost cells need filling based on BCs. - * @param comp The component index to fill boundary conditions for. - */ + void initializeBoundaryConditions(); + void initializeFluxMultiFabs(); void fillDomainBoundary(amrex::MultiFab& phi, int comp); - - - /** - * @brief Fills a component of a MultiFab with CellType information (e.g., 0=blocked, 1=free). - * Based on the input phase data (m_mf_phase). - * @param phi The MultiFab to fill (typically fills component comp_ct). - * Requires at least 2 components. - */ - void fillCellTypes(amrex::MultiFab& phi); // Fills component comp_ct - - /** - * @brief Sets the initial guess for the potential field (phi component). - * Often a linear interpolation between boundary values (m_vlo, m_vhi) in the specified - * direction. - * @param phi The MultiFab to fill with the initial state (modifies comp_phi). - */ - void fillInitialState(amrex::MultiFab& phi); // Fills component comp_phi - - - /** - * @brief Performs one iteration of the Finite Volume solver (e.g., Jacobi, SOR). - * Calculates face fluxes (updates m_flux) based on phi_old. 
- * Updates the potential field phi_new based on conservation laws using calculated fluxes. - * @param phi_old Input: Potential field from the previous iteration (requires filled ghost - * cells). - * @param phi_new Output: Updated potential field for the current iteration. - * @param dt Timestep or relaxation parameter used in the update. - */ + void fillCellTypes(amrex::MultiFab& phi); + void fillInitialState(amrex::MultiFab& phi); void advance(amrex::MultiFab& phi_old, amrex::MultiFab& phi_new, const amrex::Real& dt); diff --git a/src/props/TortuosityHypre.H b/src/props/TortuosityHypre.H index a3e5f4b..ac99b57 100644 --- a/src/props/TortuosityHypre.H +++ b/src/props/TortuosityHypre.H @@ -152,10 +152,11 @@ public: } -private: - // --- Private Methods --- + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- bool solve(); void setupMatrixEquation(); + void initializeDiffCoeff(); + amrex::iMultiFab buildTraversableMask(); void preconditionPhaseFab(); void generateActivityMask(const amrex::iMultiFab& phaseFab, int phaseID, OpenImpala::Direction dir); @@ -164,6 +165,7 @@ private: void global_fluxes(); void computePlaneFluxes(const amrex::MultiFab& mf_soln); +private: // --- Member Variables --- // Configuration (solver config m_solvertype/m_eps/m_maxiter/m_verbose are in base class) std::string m_resultspath; diff --git a/src/props/TortuosityHypre.cpp b/src/props/TortuosityHypre.cpp index 796921f..777ebee 100644 --- a/src/props/TortuosityHypre.cpp +++ b/src/props/TortuosityHypre.cpp @@ -216,32 +216,7 @@ OpenImpala::TortuosityHypre::TortuosityHypre(const amrex::Geometry& geom, const if (m_verbose > 1 && amrex::ParallelDescriptor::IOProcessor()) amrex::Print() << "TortuosityHypre: Building diffusion coefficient field..." 
<< std::endl; m_mf_diff_coeff.setVal(0.0); - // Flatten phase coefficient map to a device-accessible lookup table - { - int max_pid = 0; - for (const auto& kv : m_phase_coeff_map) { - max_pid = std::max(max_pid, kv.first); - } - amrex::Gpu::DeviceVector d_coeff_lut(max_pid + 1, 0.0); - amrex::Gpu::HostVector h_coeff_lut(max_pid + 1, 0.0); - for (const auto& kv : m_phase_coeff_map) { - h_coeff_lut[kv.first] = kv.second; - } - amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_coeff_lut.begin(), h_coeff_lut.end(), - d_coeff_lut.begin()); - const amrex::Real* lut_ptr = d_coeff_lut.data(); - const int lut_size = max_pid + 1; - - for (amrex::MFIter mfi(m_mf_diff_coeff, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { - const amrex::Box& bx = mfi.growntilebox(); - amrex::Array4 const dc_arr = m_mf_diff_coeff.array(mfi); - amrex::Array4 const phase_arr = m_mf_phase.const_array(mfi); - amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { - int pid = phase_arr(i, j, k, 0); - dc_arr(i, j, k, 0) = (pid >= 0 && pid < lut_size) ? lut_ptr[pid] : 0.0; - }); - } - } + initializeDiffCoeff(); m_mf_diff_coeff.FillBoundary(m_geom.periodicity()); // --- For multi-phase: create binary traversable mask for flood fill --- @@ -252,19 +227,7 @@ OpenImpala::TortuosityHypre::TortuosityHypre(const amrex::Geometry& geom, const << std::endl; if (m_is_multi_phase) { - // Create a temporary binary phase fab: 1 where D > 0, 0 otherwise - amrex::iMultiFab mf_binary_traversable(m_ba, m_dm, 1, m_mf_phase.nGrow()); - mf_binary_traversable.setVal(0); - for (amrex::MFIter mfi(mf_binary_traversable, amrex::TilingIfNotGPU()); mfi.isValid(); - ++mfi) { - const amrex::Box& bx = mfi.growntilebox(); - amrex::Array4 const trav_arr = mf_binary_traversable.array(mfi); - amrex::Array4 const dc_arr = m_mf_diff_coeff.const_array(mfi); - amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { - trav_arr(i, j, k, 0) = (dc_arr(i, j, k, 0) > 0.0) ? 
1 : 0; - }); - } - mf_binary_traversable.FillBoundary(m_geom.periodicity()); + amrex::iMultiFab mf_binary_traversable = buildTraversableMask(); // Flood through all traversable cells (phase ID = 1 in binary fab) generateActivityMask(mf_binary_traversable, 1, m_dir); } else { @@ -303,6 +266,48 @@ OpenImpala::TortuosityHypre::TortuosityHypre(const amrex::Geometry& geom, const // Destructor is defaulted — HypreStructSolver base class handles HYPRE cleanup. +void TortuosityHypre::initializeDiffCoeff() { + // Flatten phase coefficient map to a device-accessible lookup table + int max_pid = 0; + for (const auto& kv : m_phase_coeff_map) { + max_pid = std::max(max_pid, kv.first); + } + amrex::Gpu::DeviceVector d_coeff_lut(max_pid + 1, 0.0); + amrex::Gpu::HostVector h_coeff_lut(max_pid + 1, 0.0); + for (const auto& kv : m_phase_coeff_map) { + h_coeff_lut[kv.first] = kv.second; + } + amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_coeff_lut.begin(), h_coeff_lut.end(), + d_coeff_lut.begin()); + const amrex::Real* lut_ptr = d_coeff_lut.data(); + const int lut_size = max_pid + 1; + + for (amrex::MFIter mfi(m_mf_diff_coeff, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + amrex::Array4 const dc_arr = m_mf_diff_coeff.array(mfi); + amrex::Array4 const phase_arr = m_mf_phase.const_array(mfi); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { + int pid = phase_arr(i, j, k, 0); + dc_arr(i, j, k, 0) = (pid >= 0 && pid < lut_size) ? 
lut_ptr[pid] : 0.0; + }); + } +} + +amrex::iMultiFab TortuosityHypre::buildTraversableMask() { + // Create a temporary binary phase fab: 1 where D > 0, 0 otherwise + amrex::iMultiFab mf_binary_traversable(m_ba, m_dm, 1, m_mf_phase.nGrow()); + mf_binary_traversable.setVal(0); + for (amrex::MFIter mfi(mf_binary_traversable, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + amrex::Array4 const trav_arr = mf_binary_traversable.array(mfi); + amrex::Array4 const dc_arr = m_mf_diff_coeff.const_array(mfi); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { + trav_arr(i, j, k, 0) = (dc_arr(i, j, k, 0) > 0.0) ? 1 : 0; + }); + } + mf_binary_traversable.FillBoundary(m_geom.periodicity()); + return mf_binary_traversable; +} // setupGrid() and setupStencil() are now provided by HypreStructSolver base class. diff --git a/src/props/TortuosityMLMG.H b/src/props/TortuosityMLMG.H index 2e1c1c4..6bdb198 100644 --- a/src/props/TortuosityMLMG.H +++ b/src/props/TortuosityMLMG.H @@ -70,11 +70,11 @@ public: TortuosityMLMG(const TortuosityMLMG&) = delete; TortuosityMLMG& operator=(const TortuosityMLMG&) = delete; -protected: /** @brief Run the MLMG solver, populating m_mf_solution. * * Sets up MLABecLaplacian with appropriate BCs and B-coefficients, * runs MLMG V-cycle solve, and optionally writes a plotfile. + * Public for CUDA __device__ lambda compatibility. * * @return true if solver converged. */ diff --git a/src/props/TortuositySolverBase.H b/src/props/TortuositySolverBase.H index 473ac9f..c1607fa 100644 --- a/src/props/TortuositySolverBase.H +++ b/src/props/TortuositySolverBase.H @@ -112,7 +112,9 @@ public: return m_phase_coeff_map; } -protected: + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- + // These methods contain GPU device lambdas and must be publicly accessible for nvcc. + /** @brief Run the linear solver, populating m_mf_solution. 
* * Subclasses must: @@ -137,6 +139,7 @@ protected: void computePlaneFluxes(const amrex::MultiFab& mf_soln); void writeSolutionPlotfile(const std::string& label); +protected: // --- Grid data (stored by value to avoid dangling references) --- amrex::Geometry m_geom; amrex::BoxArray m_ba; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 524000a..09d6d5d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -29,6 +29,10 @@ set(OPENIMPALA_TEST_INCLUDES # Helper: define a test executable and register it with CTest. # If an inputs file exists in tests/inputs/.inputs, it is passed as an argument. function(openimpala_add_test TEST_NAME SOURCE_FILE) + # When CUDA is enabled, AMReX headers require nvcc compilation + if(GPU_BACKEND STREQUAL "CUDA") + set_source_files_properties(${SOURCE_FILE} PROPERTIES LANGUAGE CUDA) + endif() add_executable(${TEST_NAME} ${SOURCE_FILE}) target_link_libraries(${TEST_NAME} PRIVATE ${OPENIMPALA_TEST_LIBS}) target_include_directories(${TEST_NAME} PRIVATE ${OPENIMPALA_TEST_INCLUDES}) @@ -421,6 +425,49 @@ set_tests_properties(tDiffusion_hdf5 PROPERTIES TIMEOUT 600 ) +# ============================================================================== +# FloodFill Tests (Issue #170 — shared GPU-compatible flood fill utility) +# ============================================================================== +openimpala_add_test(tFloodFill "${CMAKE_CURRENT_SOURCE_DIR}/tFloodFill.cpp") + +# ============================================================================== +# TortuosityMLMG Tests (Issue #171 — matrix-free MLMG solver) +# +# Validates the AMReX-native MLMG solver against the same analytical +# solutions used for the HYPRE-based solver. 
+# ============================================================================== +openimpala_add_test(tTortuosityMLMG "${CMAKE_CURRENT_SOURCE_DIR}/tTortuosityMLMG.cpp") + +# MLMG uniform block: tau = 1.0 (Dirichlet BCs at the domain face; see inputs file) +add_test( + NAME tTortuosityMLMG_uniform + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 1 + ${MPIEXEC_PREFLAGS} + $<TARGET_FILE:tTortuosityMLMG> + ${CMAKE_SOURCE_DIR}/tests/inputs/tTortuosityMLMG_uniform.inputs + amrex.verbose=0 + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} +) +set_tests_properties(tTortuosityMLMG_uniform PROPERTIES + ENVIRONMENT "OMPI_ALLOW_RUN_AS_ROOT=1;OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1" + TIMEOUT 300 +) + +# MLMG Y-direction symmetry: same tau as X +add_test( + NAME tTortuosityMLMG_dirY + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 1 + ${MPIEXEC_PREFLAGS} + $<TARGET_FILE:tTortuosityMLMG> + ${CMAKE_SOURCE_DIR}/tests/inputs/tTortuosityMLMG_dirY.inputs + amrex.verbose=0 + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} +) +set_tests_properties(tTortuosityMLMG_dirY PROPERTIES + ENVIRONMENT "OMPI_ALLOW_RUN_AS_ROOT=1;OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1" + TIMEOUT 300 +) + # ============================================================================== # Unit Tests (Catch2 — fast, no MPI/AMReX required) # ============================================================================== diff --git a/tests/inputs/tTortuosityMLMG_dirY.inputs b/tests/inputs/tTortuosityMLMG_dirY.inputs new file mode 100644 index 0000000..a6d51ee --- /dev/null +++ b/tests/inputs/tTortuosityMLMG_dirY.inputs @@ -0,0 +1,21 @@ +# TortuosityMLMG Test: Y Direction +# +# Same as uniform test but in Y direction — validates directional symmetry. 
+ +domain_size = 32 +box_size = 16 +verbose = 2 +num_phases_fill = 1 +direction = Y + +expected_tau = 1.0 +tau_tolerance = 0.001 + +resultsdir = ./tTortuosityMLMG_dirY_results + +mlmg.eps = 1e-9 +mlmg.maxiter = 200 + +tortuosity.active_phases = 0 +tortuosity.phase_diffusivities = 1.0 +tortuosity.remspot_passes = 0 diff --git a/tests/inputs/tTortuosityMLMG_uniform.inputs b/tests/inputs/tTortuosityMLMG_uniform.inputs new file mode 100644 index 0000000..60d4478 --- /dev/null +++ b/tests/inputs/tTortuosityMLMG_uniform.inputs @@ -0,0 +1,27 @@ +# TortuosityMLMG Test: Uniform Dense Block +# +# Geometry: 32^3 domain, all voxels = single conducting phase (D=1.0) +# Analytical: MLMG applies Dirichlet BCs at the domain face (external boundary), +# not at cell centers like HYPRE. For a uniform medium this gives +# tau = 1.0 exactly. +# Validates: TortuosityMLMG solver + TortuositySolverBase infrastructure + +domain_size = 32 +box_size = 16 +verbose = 2 +num_phases_fill = 1 +direction = X + +expected_tau = 1.0 +tau_tolerance = 0.001 + +resultsdir = ./tTortuosityMLMG_uniform_results + +# --- MLMG Solver Controls --- +mlmg.eps = 1e-9 +mlmg.maxiter = 200 + +# --- Multi-phase configuration --- +tortuosity.active_phases = 0 +tortuosity.phase_diffusivities = 1.0 +tortuosity.remspot_passes = 0 diff --git a/tests/tFloodFill.cpp b/tests/tFloodFill.cpp new file mode 100644 index 0000000..6cca044 --- /dev/null +++ b/tests/tFloodFill.cpp @@ -0,0 +1,310 @@ +// tests/tFloodFill.cpp +// +// Direct tests for the shared FloodFill utility (FloodFill.H/cpp). +// +// Validates: +// 1. Full flood on uniform domain — all cells reachable from a single seed +// 2. Partial flood — two isolated blocks, only seeded block is reached +// 3. collectBoundarySeeds — correct inlet/outlet seed collection +// 4. 
Multi-label flood — two separate floods with distinct labels + +#include "FloodFill.H" +#include "Tortuosity.H" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace { + +struct TestStatus { + bool passed = true; + std::string fail_reason; + + void recordFail(const std::string& reason) { + if (passed) { + passed = false; + fail_reason = reason; + } + } +}; + +} // anonymous namespace + + +int main(int argc, char* argv[]) { + amrex::Initialize(argc, argv); + { + TestStatus status; + int verbose = 1; + int domain_size = 16; + int box_size = 8; + + { + amrex::ParmParse pp; + pp.query("verbose", verbose); + pp.query("domain_size", domain_size); + pp.query("box_size", box_size); + } + + const int N = domain_size; + + // Setup geometry and grid + amrex::Box domain_box(amrex::IntVect(0, 0, 0), amrex::IntVect(N - 1, N - 1, N - 1)); + amrex::RealBox rb({AMREX_D_DECL(0.0, 0.0, 0.0)}, + {AMREX_D_DECL(amrex::Real(N), amrex::Real(N), amrex::Real(N))}); + amrex::Array is_periodic{AMREX_D_DECL(0, 0, 0)}; + amrex::Geometry geom; + geom.define(domain_box, &rb, 0, is_periodic.data()); + + amrex::BoxArray ba(domain_box); + ba.maxSize(box_size); + amrex::DistributionMapping dm(ba); + + // ================================================================ + // Test 1: Full flood on uniform domain + // ================================================================ + if (status.passed) { + amrex::iMultiFab mf_phase(ba, dm, 1, 1); + mf_phase.setVal(0); // all phase 0 + mf_phase.FillBoundary(geom.periodicity()); + + amrex::iMultiFab mask(ba, dm, 1, 1); + mask.setVal(OpenImpala::FLOOD_INACTIVE); + + amrex::Vector seeds = {amrex::IntVect(0, 0, 0)}; + OpenImpala::parallelFloodFill(mask, mf_phase, 0, seeds, geom, verbose); + + // Count reachable cells + long long reached = 0; + for (amrex::MFIter mfi(mask); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.validbox(); + const auto arr = mask.const_array(mfi); + 
amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + if (arr(i, j, k, 0) == OpenImpala::FLOOD_ACTIVE) { + reached++; + } + }); + } + amrex::ParallelAllReduce::Sum(reached, amrex::ParallelContext::CommunicatorSub()); + + long long expected = static_cast(N) * N * N; + if (reached != expected) { + status.recordFail("Test 1 (full flood): reached=" + std::to_string(reached) + + ", expected=" + std::to_string(expected)); + } + if (status.passed && verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Test 1 (full flood): PASS (" << reached << "/" << expected + << " cells)\n"; + } + } + + // ================================================================ + // Test 2: Partial flood — two isolated blocks + // ================================================================ + if (status.passed) { + amrex::iMultiFab mf_phase(ba, dm, 1, 1); + mf_phase.setVal(1); // background = phase 1 + + int cube_size = 3; +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for (amrex::MFIter mfi(mf_phase, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + auto arr = mf_phase.array(mfi); + amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + // Block A at corner (1,1,1) + if (i >= 1 && i < 1 + cube_size && j >= 1 && j < 1 + cube_size && k >= 1 && + k < 1 + cube_size) { + arr(i, j, k, 0) = 0; + } + // Block B at far corner, separated by gap + int lo2 = N - 1 - cube_size; + if (i >= lo2 && i < lo2 + cube_size && j >= lo2 && j < lo2 + cube_size && + k >= lo2 && k < lo2 + cube_size) { + arr(i, j, k, 0) = 0; + } + }); + } + mf_phase.FillBoundary(geom.periodicity()); + + // Seed only in block A + amrex::iMultiFab mask(ba, dm, 1, 1); + mask.setVal(OpenImpala::FLOOD_INACTIVE); + + amrex::Vector seeds = {amrex::IntVect(1, 1, 1)}; + OpenImpala::parallelFloodFill(mask, mf_phase, 0, seeds, geom, verbose); + + long long reached = 0; + for (amrex::MFIter mfi(mask); mfi.isValid(); ++mfi) { + const 
amrex::Box& bx = mfi.validbox(); + const auto arr = mask.const_array(mfi); + amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + if (arr(i, j, k, 0) == OpenImpala::FLOOD_ACTIVE) { + reached++; + } + }); + } + amrex::ParallelAllReduce::Sum(reached, amrex::ParallelContext::CommunicatorSub()); + + long long expected_block_vol = + static_cast(cube_size) * cube_size * cube_size; + if (reached != expected_block_vol) { + status.recordFail("Test 2 (partial flood): reached=" + std::to_string(reached) + + ", expected=" + std::to_string(expected_block_vol)); + } + if (status.passed && verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Test 2 (partial flood): PASS (" << reached << " cells, " + << "only block A)\n"; + } + } + + // ================================================================ + // Test 3: collectBoundarySeeds + // ================================================================ + if (status.passed) { + amrex::iMultiFab mf_phase(ba, dm, 1, 1); + mf_phase.setVal(0); // all phase 0 + mf_phase.FillBoundary(geom.periodicity()); + + amrex::Vector inletSeeds, outletSeeds; + OpenImpala::collectBoundarySeeds(mf_phase, 0, 0 /* X direction */, geom, inletSeeds, + outletSeeds); + + // For X direction: inlet = i=0 face, outlet = i=N-1 face + // Each face has N*N cells + long long expected_seeds = static_cast(N) * N; + if (static_cast(inletSeeds.size()) != expected_seeds) { + status.recordFail( + "Test 3 (boundary seeds): inlet seeds=" + std::to_string(inletSeeds.size()) + + ", expected=" + std::to_string(expected_seeds)); + } + if (static_cast(outletSeeds.size()) != expected_seeds) { + status.recordFail( + "Test 3 (boundary seeds): outlet seeds=" + std::to_string(outletSeeds.size()) + + ", expected=" + std::to_string(expected_seeds)); + } + + // Verify all inlet seeds have i=0 + for (const auto& s : inletSeeds) { + if (s[0] != 0) { + status.recordFail("Test 3 (boundary seeds): inlet seed with i=" + + std::to_string(s[0]) + " (expected 0)"); + 
break; + } + } + // Verify all outlet seeds have i=N-1 + for (const auto& s : outletSeeds) { + if (s[0] != N - 1) { + status.recordFail( + "Test 3 (boundary seeds): outlet seed with i=" + std::to_string(s[0]) + + " (expected " + std::to_string(N - 1) + ")"); + break; + } + } + + if (status.passed && verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Test 3 (boundary seeds): PASS (inlet=" << inletSeeds.size() + << ", outlet=" << outletSeeds.size() << ")\n"; + } + } + + // ================================================================ + // Test 4: Multi-label flood (two labels on same mask) + // ================================================================ + if (status.passed) { + amrex::iMultiFab mf_phase(ba, dm, 1, 1); + mf_phase.setVal(1); // background + + int cube_size = 3; +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for (amrex::MFIter mfi(mf_phase, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + auto arr = mf_phase.array(mfi); + amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + if (i >= 1 && i < 1 + cube_size && j >= 1 && j < 1 + cube_size && k >= 1 && + k < 1 + cube_size) { + arr(i, j, k, 0) = 0; + } + int lo2 = N - 1 - cube_size; + if (i >= lo2 && i < lo2 + cube_size && j >= lo2 && j < lo2 + cube_size && + k >= lo2 && k < lo2 + cube_size) { + arr(i, j, k, 0) = 0; + } + }); + } + mf_phase.FillBoundary(geom.periodicity()); + + amrex::iMultiFab mask(ba, dm, 1, 1); + mask.setVal(OpenImpala::FLOOD_INACTIVE); + + // Flood block A with label=1 + amrex::Vector seeds_a = {amrex::IntVect(1, 1, 1)}; + OpenImpala::parallelFloodFill(mask, mf_phase, 0, seeds_a, geom, verbose, 1); + + // Flood block B with label=2 + int lo2 = N - 1 - cube_size; + amrex::Vector seeds_b = {amrex::IntVect(lo2, lo2, lo2)}; + OpenImpala::parallelFloodFill(mask, mf_phase, 0, seeds_b, geom, verbose, 2); + + long long count_1 = 0, count_2 = 0; + for (amrex::MFIter 
mfi(mask); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.validbox(); + const auto arr = mask.const_array(mfi); + amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + if (arr(i, j, k, 0) == 1) + count_1++; + if (arr(i, j, k, 0) == 2) + count_2++; + }); + } + amrex::ParallelAllReduce::Sum(count_1, amrex::ParallelContext::CommunicatorSub()); + amrex::ParallelAllReduce::Sum(count_2, amrex::ParallelContext::CommunicatorSub()); + + long long expected_vol = static_cast(cube_size) * cube_size * cube_size; + if (count_1 != expected_vol) { + status.recordFail("Test 4 (multi-label): label 1 count=" + std::to_string(count_1) + + ", expected=" + std::to_string(expected_vol)); + } + if (count_2 != expected_vol) { + status.recordFail("Test 4 (multi-label): label 2 count=" + std::to_string(count_2) + + ", expected=" + std::to_string(expected_vol)); + } + + if (status.passed && verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Test 4 (multi-label): PASS (label1=" << count_1 + << ", label2=" << count_2 << ")\n"; + } + } + + // ================================================================ + // Final summary + // ================================================================ + if (amrex::ParallelDescriptor::IOProcessor()) { + if (status.passed) { + amrex::Print() << "\n--- TEST RESULT: PASS ---\n"; + } else { + amrex::Print() << "\n--- TEST RESULT: FAIL ---\n"; + amrex::Print() << " Reason: " << status.fail_reason << "\n"; + } + } + + if (!status.passed) { + amrex::Abort("tFloodFill Test FAILED."); + } + } + amrex::Finalize(); + return 0; +} diff --git a/tests/tTortuosityMLMG.cpp b/tests/tTortuosityMLMG.cpp new file mode 100644 index 0000000..63403bf --- /dev/null +++ b/tests/tTortuosityMLMG.cpp @@ -0,0 +1,219 @@ +// tests/tTortuosityMLMG.cpp +// +// Validates the TortuosityMLMG matrix-free solver against known analytical +// solutions. Uses the same synthetic geometry approach as tMultiPhaseTransport. 
+// +// Test cases (selected via inputs): +// uniform: All cells = phase 0, tau = (N-1)/N +// twophase: Alternating layers with equal D, tau = (N-1)/N + +#include "TortuosityMLMG.H" +#include "Tortuosity.H" +#include "SolverConfig.H" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +int main(int argc, char* argv[]) { + amrex::Initialize(argc, argv); + { + amrex::Real strt_time = amrex::second(); + bool test_passed = true; + std::string fail_reason; + + // --- Configuration via ParmParse --- + int domain_size = 32; + int box_size = 16; + int verbose = 1; + int num_phases_fill = 1; + std::string direction_str = "X"; + amrex::Real expected_tau = 1.0; + amrex::Real tau_tolerance = 1e-3; + std::string resultsdir = "./tTortuosityMLMG_results"; + + { + amrex::ParmParse pp; + pp.query("domain_size", domain_size); + pp.query("box_size", box_size); + pp.query("verbose", verbose); + pp.query("num_phases_fill", num_phases_fill); + pp.query("direction", direction_str); + pp.query("expected_tau", expected_tau); + pp.query("tau_tolerance", tau_tolerance); + pp.query("resultsdir", resultsdir); + } + + OpenImpala::Direction direction = OpenImpala::parseDirection(direction_str); + + if (verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << "\n--- TortuosityMLMG Test ---\n"; + amrex::Print() << " Domain Size: " << domain_size << "^3\n"; + amrex::Print() << " Direction: " << direction_str << "\n"; + amrex::Print() << " Expected Tau: " << expected_tau << "\n"; + amrex::Print() << " Tau Tolerance: " << tau_tolerance << "\n"; + amrex::Print() << "-------------------------------\n\n"; + } + + // --- Create synthetic domain --- + amrex::Box domain_box(amrex::IntVect(0, 0, 0), + amrex::IntVect(domain_size - 1, domain_size - 1, domain_size - 1)); + amrex::RealBox rb({AMREX_D_DECL(0.0, 0.0, 0.0)}, + {AMREX_D_DECL(amrex::Real(domain_size), amrex::Real(domain_size), + 
amrex::Real(domain_size))}); + amrex::Array is_periodic{AMREX_D_DECL(0, 0, 0)}; + amrex::Geometry geom; + geom.define(domain_box, &rb, 0, is_periodic.data()); + + amrex::BoxArray ba(domain_box); + ba.maxSize(box_size); + amrex::DistributionMapping dm(ba); + + // --- Create and fill phase field --- + amrex::iMultiFab mf_phase(ba, dm, 1, 1); + + if (num_phases_fill == 1) { + mf_phase.setVal(0); + } else { + // Alternating layers along X +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for (amrex::MFIter mfi(mf_phase, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + amrex::Array4 const phase_arr = mf_phase.array(mfi); + int dir_idx = static_cast(direction); + amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + int coord = (dir_idx == 0) ? i : (dir_idx == 1) ? j : k; + phase_arr(i, j, k, 0) = (coord % 2 == 0) ? 0 : 1; + }); + } + } + mf_phase.FillBoundary(geom.periodicity()); + + amrex::Real vf = 1.0; + + // --- Create results directory --- + if (!resultsdir.empty() && amrex::ParallelDescriptor::IOProcessor()) { + amrex::UtilCreateDirectory(resultsdir, 0755); + } + amrex::ParallelDescriptor::Barrier(); + + // --- Construct and run TortuosityMLMG --- + std::unique_ptr tort; + try { + tort = std::make_unique( + geom, ba, dm, mf_phase, vf, 0 /* phase_id */, direction, resultsdir, 0.0 /* vlo */, + 1.0 /* vhi */, verbose, false /* write_plotfile */); + } catch (const std::exception& e) { + test_passed = false; + fail_reason = "TortuosityMLMG construction failed: " + std::string(e.what()); + } + + // --- Calculate tortuosity --- + amrex::Real actual_tau = std::numeric_limits::quiet_NaN(); + if (test_passed && tort) { + try { + actual_tau = tort->value(); + if (std::isnan(actual_tau) || std::isinf(actual_tau)) { + test_passed = false; + fail_reason = "Tortuosity value is NaN or Inf"; + } + } catch (const std::exception& e) { + test_passed = false; + fail_reason = "Exception during solve: " 
+ std::string(e.what()); + } + } + + // --- Check solver convergence --- + if (test_passed && tort) { + if (!tort->getSolverConverged()) { + test_passed = false; + fail_reason = "MLMG solver did not converge (residual=" + + std::to_string(tort->getFinalRelativeResidualNorm()) + ")"; + } else if (verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Solver convergence: PASS (" << tort->getSolverIterations() + << " iterations, residual=" << tort->getFinalRelativeResidualNorm() + << ")\n"; + } + } + + // --- Validate tortuosity against expected value --- + if (test_passed) { + amrex::Real diff = std::abs(actual_tau - expected_tau); + if (diff > tau_tolerance) { + test_passed = false; + fail_reason = "Tortuosity mismatch. Expected: " + std::to_string(expected_tau) + + ", Got: " + std::to_string(actual_tau) + + ", Diff: " + std::to_string(diff); + } else if (verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Tortuosity value check: PASS (tau=" << actual_tau + << ", expected=" << expected_tau << ", diff=" << diff << ")\n"; + } + } + + // --- Check active volume fraction --- + if (test_passed && tort) { + amrex::Real active_vf = tort->getActiveVolumeFraction(); + if (active_vf < 0.99) { + test_passed = false; + fail_reason = + "Active volume fraction unexpectedly low: " + std::to_string(active_vf); + } else if (verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Active VF check: PASS (active_vf=" << active_vf + << ")\n"; + } + } + + // --- Check plane flux conservation --- + if (test_passed && tort) { + const auto& plane_fluxes = tort->getPlaneFluxes(); + amrex::Real max_dev = tort->getPlaneFluxMaxDeviation(); + constexpr amrex::Real plane_flux_tol = 1.0e-6; + + if (!plane_fluxes.empty() && max_dev > plane_flux_tol) { + test_passed = false; + fail_reason = + "Plane flux conservation failed. 
Max deviation: " + std::to_string(max_dev); + } else if (verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Plane flux conservation: PASS (" << plane_fluxes.size() + << " faces, max_dev=" << std::scientific << max_dev + << std::defaultfloat << ")\n"; + } + } + + // --- Final summary --- + amrex::Real stop_time = amrex::second() - strt_time; + amrex::ParallelDescriptor::ReduceRealMax(stop_time, + amrex::ParallelDescriptor::IOProcessorNumber()); + + if (amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << "\n Run time = " << stop_time << " sec\n"; + if (test_passed) { + amrex::Print() << "\n--- TEST RESULT: PASS ---\n"; + } else { + amrex::Print() << "\n--- TEST RESULT: FAIL ---\n"; + amrex::Print() << " Reason: " << fail_reason << "\n"; + } + } + + if (!test_passed) { + amrex::Abort("TortuosityMLMG Test FAILED."); + } + } + amrex::Finalize(); + return 0; +} diff --git a/tutorials/01_hello_openimpala.ipynb b/tutorials/01_hello_openimpala.ipynb index 9968cad..38b4403 100644 --- a/tutorials/01_hello_openimpala.ipynb +++ b/tutorials/01_hello_openimpala.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala, PoreSpy for structure generation, and Matplotlib for plots.\n!pip install -q openimpala porespy matplotlib" + "source": "# Install OpenImpala, PoreSpy for structure generation, and Matplotlib for plots.\n!pip install -q openimpala-cuda porespy matplotlib" }, { "cell_type": "code", diff --git a/tutorials/02_digital_twin.ipynb b/tutorials/02_digital_twin.ipynb index 6277621..7446340 100644 --- a/tutorials/02_digital_twin.ipynb +++ b/tutorials/02_digital_twin.ipynb @@ -12,7 +12,7 @@ "outputs": [], "source": [ "# Install OpenImpala, PyBaMM, and visualization utilities\n", - "!pip install -q openimpala pybamm bpx tifffile matplotlib yt" + "!pip install -q openimpala-cuda pybamm bpx tifffile matplotlib yt" ] }, { diff --git a/tutorials/03_rev_and_uncertainty.ipynb 
b/tutorials/03_rev_and_uncertainty.ipynb index 0848851..9d7033c 100644 --- a/tutorials/03_rev_and_uncertainty.ipynb +++ b/tutorials/03_rev_and_uncertainty.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala and utilities\n!pip install -q openimpala tifffile matplotlib scipy" + "source": "# Install OpenImpala and utilities\n!pip install -q openimpala-cuda tifffile matplotlib scipy" }, { "cell_type": "code", diff --git a/tutorials/04_multiphase_and_fields.ipynb b/tutorials/04_multiphase_and_fields.ipynb index 0e4c463..fc1f089 100644 --- a/tutorials/04_multiphase_and_fields.ipynb +++ b/tutorials/04_multiphase_and_fields.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala, PoreSpy, yt (for AMReX plotfile visualisation), and Matplotlib.\n!pip install -q openimpala porespy yt matplotlib" + "source": "# Install OpenImpala, PoreSpy, yt (for AMReX plotfile visualisation), and Matplotlib.\n!pip install -q openimpala-cuda porespy yt matplotlib" }, { "cell_type": "code", diff --git a/tutorials/05_surrogate_modelling.ipynb b/tutorials/05_surrogate_modelling.ipynb index 3505cd7..cf8082f 100644 --- a/tutorials/05_surrogate_modelling.ipynb +++ b/tutorials/05_surrogate_modelling.ipynb @@ -12,7 +12,7 @@ "outputs": [], "source": [ "# Install OpenImpala and ML libraries\n", - "!pip install -q openimpala porespy scikit-learn matplotlib" + "!pip install -q openimpala-cuda porespy scikit-learn matplotlib" ] }, { diff --git a/tutorials/06_topology_optimisation.ipynb b/tutorials/06_topology_optimisation.ipynb index b668c30..31bd69a 100644 --- a/tutorials/06_topology_optimisation.ipynb +++ b/tutorials/06_topology_optimisation.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "!pip install -q openimpala matplotlib" + "source": "!pip install -q openimpala-cuda matplotlib" }, { "cell_type": "code", diff --git 
a/tutorials/07_hpc_scaling.ipynb b/tutorials/07_hpc_scaling.ipynb index 7c3cb77..a10b2ce 100644 --- a/tutorials/07_hpc_scaling.ipynb +++ b/tutorials/07_hpc_scaling.ipynb @@ -242,7 +242,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "!pip install -q openimpala porespy matplotlib" + "source": "!pip install -q openimpala-cuda porespy matplotlib" }, { "cell_type": "code", @@ -266,7 +266,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": "## Summary\n\n| Scenario | Approach |\n|----------|----------|\n| **Laptop / Colab** | `pip install openimpala`, use NumPy arrays directly |\n| **Small cluster (1-16 cores)** | `mpirun -np 16 python script.py` with NumPy loading |\n| **Large cluster (16-128+ cores)** | `mpirun -np 128 python script.py` with `oi.read_image()` for parallel I/O |\n| **HPC without Python** | `mpirun -np 128 Diffusion ./inputs` (pure C++ application) |\n\n### Solver Quick Reference\n\n| Solver | Best For | Notes |\n|--------|----------|-------|\n| **FlexGMRES** | General use | Robust default, handles non-symmetric systems |\n| **PCG** | Symmetric problems | Fastest when applicable |\n| **SMG/PFMG** | Structured grids | Geometric multigrid, excellent on regular domains |\n| **BiCGSTAB** | Non-symmetric | Alternative to GMRES |\n\nThe API is the same at every scale — only the launch command changes. The scaling study and solver comparison in Sections 6-7 provide concrete data to guide your deployment decisions.\n\n**Back to the beginning:**\n- [Tutorial 1: Getting Started](01_hello_openimpala.ipynb) — Core workflow with synthetic microstructures.\n- [Tutorial 2: From 3D Image to Device Model](02_digital_twin.ipynb) — Load real CT scans and export to PyBaMM.\n\n---\n\n## References & Further Reading\n\n1. **OpenImpala:** S. Mayner, F. Ciucci, *OpenImpala: open-source computational framework for microstructural analysis of 3D tomography data*, SoftwareX (2024). [GitHub](https://github.com/BASE-Laboratory/OpenImpala)\n2. 
**AMReX:** W. Zhang et al., *AMReX: a framework for block-structured adaptive mesh refinement*, J. Open Source Software 4(37), 1370 (2019). [doi:10.21105/joss.01370](https://doi.org/10.21105/joss.01370)\n3. **AMReX scaling:** A. S. Almgren et al., *Block-structured adaptive mesh refinement — theory, implementation and application*, J. Comput. Physics 332, 1-28 (2017). [doi:10.1016/j.jcp.2016.12.073](https://doi.org/10.1016/j.jcp.2016.12.073)\n4. **HYPRE:** R. D. Falgout & U. M. Yang, *hypre: A library of high performance preconditioners*, Computational Science — ICCS 2002, LNCS 2331, pp. 632-641 (2002). [doi:10.1007/3-540-47789-6_66](https://doi.org/10.1007/3-540-47789-6_66)\n5. **Parallel HDF5:** The HDF Group, *HDF5 — Parallel I/O*, [hdfgroup.org](https://www.hdfgroup.org/solutions/hdf5/)\n6. **Apptainer/Singularity for HPC:** G. M. Kurtzer et al., *Singularity: Scientific containers for mobility of compute*, PLoS ONE 12(5), e0177459 (2017). [doi:10.1371/journal.pone.0177459](https://doi.org/10.1371/journal.pone.0177459)\n7. **MPI standard:** Message Passing Interface Forum, *MPI: A Message-Passing Interface Standard, Version 4.0* (2021). 
[mpi-forum.org](https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf)" + "source": "## Summary\n\n| Scenario | Approach |\n|----------|----------|\n| **Laptop / Colab** | `pip install openimpala-cuda`, use NumPy arrays directly |\n| **Small cluster (1-16 cores)** | `mpirun -np 16 python script.py` with NumPy loading |\n| **Large cluster (16-128+ cores)** | `mpirun -np 128 python script.py` with `oi.read_image()` for parallel I/O |\n| **HPC without Python** | `mpirun -np 128 Diffusion ./inputs` (pure C++ application) |\n\n### Solver Quick Reference\n\n| Solver | Best For | Notes |\n|--------|----------|-------|\n| **FlexGMRES** | General use | Robust default, handles non-symmetric systems |\n| **PCG** | Symmetric problems | Fastest when applicable |\n| **SMG/PFMG** | Structured grids | Geometric multigrid, excellent on regular domains |\n| **BiCGSTAB** | Non-symmetric | Alternative to GMRES |\n\nThe API is the same at every scale — only the launch command changes. The scaling study and solver comparison in Sections 6-7 provide concrete data to guide your deployment decisions.\n\n**Back to the beginning:**\n- [Tutorial 1: Getting Started](01_hello_openimpala.ipynb) — Core workflow with synthetic microstructures.\n- [Tutorial 2: From 3D Image to Device Model](02_digital_twin.ipynb) — Load real CT scans and export to PyBaMM.\n\n---\n\n## References & Further Reading\n\n1. **OpenImpala:** S. Mayner, F. Ciucci, *OpenImpala: open-source computational framework for microstructural analysis of 3D tomography data*, SoftwareX (2024). [GitHub](https://github.com/BASE-Laboratory/OpenImpala)\n2. **AMReX:** W. Zhang et al., *AMReX: a framework for block-structured adaptive mesh refinement*, J. Open Source Software 4(37), 1370 (2019). [doi:10.21105/joss.01370](https://doi.org/10.21105/joss.01370)\n3. **AMReX scaling:** A. S. Almgren et al., *Block-structured adaptive mesh refinement — theory, implementation and application*, J. Comput. Physics 332, 1-28 (2017). 
[doi:10.1016/j.jcp.2016.12.073](https://doi.org/10.1016/j.jcp.2016.12.073)\n4. **HYPRE:** R. D. Falgout & U. M. Yang, *hypre: A library of high performance preconditioners*, Computational Science — ICCS 2002, LNCS 2331, pp. 632-641 (2002). [doi:10.1007/3-540-47789-6_66](https://doi.org/10.1007/3-540-47789-6_66)\n5. **Parallel HDF5:** The HDF Group, *HDF5 — Parallel I/O*, [hdfgroup.org](https://www.hdfgroup.org/solutions/hdf5/)\n6. **Apptainer/Singularity for HPC:** G. M. Kurtzer et al., *Singularity: Scientific containers for mobility of compute*, PLoS ONE 12(5), e0177459 (2017). [doi:10.1371/journal.pone.0177459](https://doi.org/10.1371/journal.pone.0177459)\n7. **MPI standard:** Message Passing Interface Forum, *MPI: A Message-Passing Interface Standard, Version 4.0* (2021). [mpi-forum.org](https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf)" } ], "metadata": {