Merged
30 commits
395cbbb
Merge pull request #188 from BASE-Laboratory/working
jameslehoux Mar 22, 2026
e880daa
Fix GPU wheel Docker image and add FloodFill/MLMG solver tests
jameslehoux Mar 23, 2026
0a0780a
Merge pull request #189 from BASE-Laboratory/claude/add-profiling-tun…
jameslehoux Mar 23, 2026
662f215
Fix MLMG test expected tau and apply clang-format
jameslehoux Mar 23, 2026
31814df
Merge pull request #190 from BASE-Laboratory/claude/add-profiling-tun…
jameslehoux Mar 23, 2026
e1a0f8c
Publish GPU wheel as separate openimpala-cuda PyPI package
jameslehoux Mar 23, 2026
5bb52b7
Fix GPU wheel build: use GCC 13 toolset for CUDA 12.6 compatibility
claude Mar 23, 2026
8ae8dd1
Merge pull request #191 from BASE-Laboratory/claude/add-profiling-tun…
jameslehoux Mar 23, 2026
7195ddf
Fix CUDA build: compile all sources as CUDA when GPU backend is enabled
claude Mar 23, 2026
b8de6db
Merge pull request #192 from BASE-Laboratory/claude/add-profiling-tun…
jameslehoux Mar 23, 2026
c05c8da
Fix CUDA atomicAdd: replace long long with int for GPU atomic counters
claude Mar 23, 2026
1789411
Merge pull request #193 from BASE-Laboratory/claude/add-profiling-tun…
jameslehoux Mar 23, 2026
ece091c
Fix CUDA: make methods with __device__ lambdas publicly accessible
claude Mar 24, 2026
3068f6f
Merge pull request #194 from BASE-Laboratory/claude/add-profiling-tun…
jameslehoux Mar 24, 2026
41df210
Fix CUDA: make remaining private methods with GPU lambdas public
claude Mar 24, 2026
89c555c
Merge pull request #195 from BASE-Laboratory/claude/add-profiling-tun…
jameslehoux Mar 24, 2026
98614fe
Fix CUDA compilation errors: extract device lambdas from constructors…
jameslehoux Mar 28, 2026
977408e
Merge pull request #196 from BASE-Laboratory/claude/fix-gpu-wheels-bu…
jameslehoux Mar 28, 2026
5f4eedc
Fix CUDA: make device-lambda methods public and fix setVal template d…
jameslehoux Mar 28, 2026
0409952
Merge pull request #197 from BASE-Laboratory/claude/fix-gpu-wheels-bu…
jameslehoux Mar 28, 2026
f329533
Fix CUDA: replace deleted DeviceScalar move-assignment with htod_memcpy
jameslehoux Mar 28, 2026
13114ac
Merge pull request #198 from BASE-Laboratory/claude/fix-gpu-wheels-bu…
jameslehoux Mar 28, 2026
65dfdf9
Fix CUDA: explicit RunOn::Host for BaseFab::copy template deduction i…
jameslehoux Mar 28, 2026
78d5afe
Merge pull request #199 from BASE-Laboratory/claude/fix-gpu-wheels-bu…
jameslehoux Mar 28, 2026
65913f7
Fix CUDA: extract device lambda from ThroughThicknessProfile constructor
jameslehoux Mar 28, 2026
2a07af6
Merge pull request #200 from BASE-Laboratory/claude/fix-gpu-wheels-bu…
jameslehoux Mar 28, 2026
165f284
Fix CUDA: move TortuosityMLMG::solve() from protected to public
jameslehoux Mar 28, 2026
1e1abc3
Merge pull request #201 from BASE-Laboratory/claude/fix-gpu-wheels-bu…
jameslehoux Mar 28, 2026
d184400
Fix clang-format violations in ThroughThicknessProfile and EffectiveD…
jameslehoux Mar 29, 2026
4053ec4
Merge pull request #202 from BASE-Laboratory/claude/fix-gpu-wheels-bu…
jameslehoux Mar 29, 2026
37 changes: 26 additions & 11 deletions .github/workflows/pypi-wheels-gpu.yml
@@ -31,7 +31,7 @@ jobs:
uses: actions/cache@v4
with:
path: .cibw-deps-cache
key: cibw-deps-gpu-cuda12-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-v1
key: cibw-deps-gpu-cuda12.6-manylinux_2_34-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-gcc13-v2

- name: Build GPU wheels
run: python -m cibuildwheel --output-dir wheelhouse
@@ -42,19 +42,24 @@ jobs:

# Use NVIDIA's CUDA-enabled manylinux image (CUDA 12.6, AlmaLinux 8)
# This provides nvcc, CUDA runtime, and cuBLAS/cuSPARSE out of the box.
CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux_2_28_x86_64_cuda_12.6
CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux_2_34_x86_64_cuda_12.6

# Build all dependencies with CUDA support.
# HDF5 and libtiff are CPU-only (no GPU path needed).
# HYPRE is built with --with-cuda for GPU-accelerated solves.
# AMReX is built with -DAMReX_GPU_BACKEND=CUDA for device kernels.
CIBW_BEFORE_ALL_LINUX: >
dnf install -y epel-release &&
dnf --enablerepo=powertools install -y
dnf --enablerepo=crb install -y
openmpi-devel gcc-gfortran gcc-c++ wget git
zlib-devel libjpeg-turbo-devel python3-pip &&
zlib-devel libjpeg-turbo-devel python3-pip
gcc-toolset-13 gcc-toolset-13-gcc gcc-toolset-13-gcc-c++
gcc-toolset-13-gcc-gfortran &&
pip3 install "cmake>=3.28,<4" &&
export PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH &&
export PATH=/opt/rh/gcc-toolset-13/root/usr/bin:/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH &&
export CC=/opt/rh/gcc-toolset-13/root/usr/bin/gcc &&
export CXX=/opt/rh/gcc-toolset-13/root/usr/bin/g++ &&
export FC=/opt/rh/gcc-toolset-13/root/usr/bin/gfortran &&
export CUDA_HOME=/usr/local/cuda &&
if [ -f /project/.cibw-deps-cache/deps.tar.gz ]; then
echo "=== Restoring cached GPU dependencies ===" &&
@@ -92,7 +97,8 @@ jobs:
--with-cuda-home=/usr/local/cuda --enable-shared=no
CC=mpicc CXX=mpicxx FC=mpif90
CFLAGS="-O2 -fPIC" CXXFLAGS="-O2 -fPIC" FFLAGS="-O2 -fPIC"
CUDA_HOME=/usr/local/cuda &&
CUDA_HOME=/usr/local/cuda
CUDAFLAGS="-allow-unsupported-compiler" &&
make -j$(nproc) &&
make install &&
cd ../.. &&
@@ -107,26 +113,35 @@ jobs:
-DAMReX_FORTRAN=ON
-DAMReX_PARTICLES=OFF
-DAMReX_GPU_BACKEND=CUDA
-DAMReX_CUDA_ARCH=60;70;75;80;86;89;90
'-DAMReX_CUDA_ARCH=60;70;75;80;86;89;90'
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_CUDA_ARCHITECTURES="60;70;75;80;86;89;90" &&
'-DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90'
-DCMAKE_CUDA_HOST_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/g++ &&
cmake --build /tmp/amrex/build -j$(nproc) &&
cmake --install /tmp/amrex/build &&
mkdir -p /project/.cibw-deps-cache &&
tar czf /project/.cibw-deps-cache/deps.tar.gz /usr/local ;
fi

CIBW_BEFORE_BUILD: pip install "cmake>=3.28,<4"
# Rename the package to openimpala-cuda for the GPU wheel.
# The import name stays 'openimpala' — only the PyPI distribution name changes.
CIBW_BEFORE_BUILD: >
pip install "cmake>=3.28,<4" &&
sed -i 's/^name = "openimpala"/name = "openimpala-cuda"/' /project/pyproject.toml

# Point to MPI, CUDA, and our compiled GPU dependencies.
CIBW_ENVIRONMENT_LINUX: >
PATH="/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH"
PATH="/opt/rh/gcc-toolset-13/root/usr/bin:/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH"
CC="/opt/rh/gcc-toolset-13/root/usr/bin/gcc"
CXX="/opt/rh/gcc-toolset-13/root/usr/bin/g++"
FC="/opt/rh/gcc-toolset-13/root/usr/bin/gfortran"
CUDA_HOME="/usr/local/cuda"
CUDAHOSTCXX="/opt/rh/gcc-toolset-13/root/usr/bin/g++"
CMAKE_C_COMPILER="mpicc"
CMAKE_CXX_COMPILER="mpicxx"
CMAKE_PREFIX_PATH="/usr/local"
CMAKE_GENERATOR="Unix Makefiles"
CMAKE_ARGS="-DGPU_BACKEND=CUDA -DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90"
CMAKE_ARGS="-DGPU_BACKEND=CUDA '-DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90' -DCMAKE_CUDA_HOST_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/g++"

# Vendor libraries but exclude host-specific MPI, OpenMP, Fortran runtime,
# and CUDA runtime libraries (users must have CUDA toolkit installed).
2 changes: 1 addition & 1 deletion notebooks/profiling_and_tuning.ipynb
@@ -75,7 +75,7 @@
"source": [
"# Install system MPI and Python packages\n",
"!apt-get install -y libopenmpi-dev > /dev/null 2>&1\n",
"!pip install openimpala[all] > /dev/null 2>&1\n",
"!pip install openimpala-cuda[all] > /dev/null 2>&1\n",
"!pip install porespy > /dev/null 2>&1\n",
"print(\"Dependencies installed.\")"
]
9 changes: 8 additions & 1 deletion python/CMakeLists.txt
@@ -21,14 +21,21 @@ find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
find_package(pybind11 2.11 CONFIG REQUIRED)

# --- Build the extension module ---
pybind11_add_module(_core
set(BINDING_SOURCES
bindings/module.cpp
bindings/enums.cpp
bindings/io.cpp
bindings/props.cpp
bindings/solvers.cpp
bindings/config.cpp
)
pybind11_add_module(_core ${BINDING_SOURCES})

# When CUDA is enabled, AMReX headers contain __host__/__device__ attributes
# that require compilation by nvcc.
if(GPU_BACKEND STREQUAL "CUDA")
set_source_files_properties(${BINDING_SOURCES} PROPERTIES LANGUAGE CUDA)
endif()

target_link_libraries(_core PRIVATE
openimpala_io
5 changes: 4 additions & 1 deletion python/openimpala/__init__.py
@@ -27,7 +27,10 @@
try:
__version__ = version("openimpala")
except PackageNotFoundError:
__version__ = "unknown"
try:
__version__ = version("openimpala-cuda")
except PackageNotFoundError:
__version__ = "unknown"

# Session context manager (pure Python — always available)
from .session import Session
20 changes: 18 additions & 2 deletions src/CMakeLists.txt
@@ -7,14 +7,15 @@
# IO Library
# ==============================================================================
# Library sources (exclude test drivers that start with 't')
add_library(openimpala_io OBJECT
set(IO_SOURCES
io/CathodeWrite.cpp
io/DatReader.cpp
io/HDF5Reader.cpp
io/ImageLoader.cpp
io/RawReader.cpp
io/TiffReader.cpp
)
add_library(openimpala_io OBJECT ${IO_SOURCES})

target_include_directories(openimpala_io PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/io
@@ -36,7 +37,7 @@ target_include_directories(openimpala_io PUBLIC
# ==============================================================================
# Props Library (C++ only — Fortran kernels migrated to native C++)
# ==============================================================================
add_library(openimpala_props OBJECT
set(PROPS_SOURCES
props/ConnectedComponents.cpp
props/DeffTensor.cpp
props/EffectiveDiffusivityHypre.cpp
@@ -53,6 +54,7 @@
props/TortuositySolverBase.cpp
props/VolumeFraction.cpp
)
add_library(openimpala_props OBJECT ${PROPS_SOURCES})

target_include_directories(openimpala_props PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/props
@@ -101,5 +103,19 @@ target_include_directories(Diffusion PRIVATE
${HDF5_INCLUDE_DIRS}
)

# ==============================================================================
# CUDA: compile all C++ sources as CUDA when GPU backend is enabled
# ==============================================================================
# AMReX headers use __host__/__device__ attributes (AMREX_GPU_HOST_DEVICE)
# when built with CUDA. Any translation unit that includes AMReX headers must
# therefore be compiled by nvcc. We achieve this by setting the LANGUAGE
# property to CUDA on all .cpp source files.
if(GPU_BACKEND STREQUAL "CUDA")
set_source_files_properties(
${IO_SOURCES} ${PROPS_SOURCES} props/Diffusion.cpp
PROPERTIES LANGUAGE CUDA
)
endif()

# Install the main executable
install(TARGETS Diffusion RUNTIME DESTINATION bin)
15 changes: 8 additions & 7 deletions src/io/HDF5Reader.H
@@ -160,6 +160,14 @@ public:
}


// --- Implementation methods (public for CUDA __device__ lambda compatibility) ---
// Template function to read a hyperslab and apply thresholding
// Needed because the native type read from file varies.
template <typename T_Native>
void readAndThresholdFab(H5::DataSet& dataset, double raw_threshold, int value_if_true,
int value_if_false, const amrex::Box& box,
amrex::IArrayBox& fab) const;

private:
/**
* @brief Internal implementation for reading HDF5 metadata only.
@@ -169,13 +177,6 @@
*/
bool readMetadataInternal();

// Template function to read a hyperslab and apply thresholding
// Needed because the native type read from file varies.
template <typename T_Native>
void readAndThresholdFab(H5::DataSet& dataset, double raw_threshold, int value_if_true,
int value_if_false, const amrex::Box& box,
amrex::IArrayBox& fab) const;

// --- Member Variables ---
std::string m_filename; /**< Filename of the source HDF5 file */
std::string m_hdf5dataset; /**< Path to the dataset within the HDF5 file */
3 changes: 2 additions & 1 deletion src/props/ConnectedComponents.H
@@ -59,12 +59,13 @@ public:
return m_labels;
}

private:
// --- Implementation methods (public for CUDA __device__ lambda compatibility) ---
void run(const amrex::iMultiFab& mf_phase, int phase_id);

amrex::IntVect findNextUnlabeled(const amrex::iMultiFab& labelMF,
const amrex::iMultiFab& phaseFab, int phaseID) const;

private:
amrex::Geometry m_geom;
amrex::BoxArray m_ba;
amrex::DistributionMapping m_dm;
10 changes: 5 additions & 5 deletions src/props/ConnectedComponents.cpp
@@ -108,8 +108,8 @@ void ConnectedComponents::run(const amrex::iMultiFab& mf_phase, int phase_id) {
// Compute volume of each component using GPU-compatible atomic scatter-add
m_volumes.resize(m_num_components, 0);

amrex::Gpu::DeviceVector<long long> d_volumes(m_num_components, 0);
long long* d_vol_ptr = d_volumes.data();
amrex::Gpu::DeviceVector<int> d_volumes(m_num_components, 0);
int* d_vol_ptr = d_volumes.data();
const int num_comp = m_num_components;

for (amrex::MFIter mfi(m_labels); mfi.isValid(); ++mfi) {
@@ -119,20 +119,20 @@
amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
int lbl = label_arr(i, j, k, 0);
if (lbl > 0 && lbl <= num_comp) {
amrex::Gpu::Atomic::Add(&d_vol_ptr[lbl - 1], 1LL);
amrex::Gpu::Atomic::Add(&d_vol_ptr[lbl - 1], 1);
}
});
}

std::vector<long long> local_volumes(m_num_components);
std::vector<int> local_volumes(m_num_components);
amrex::Gpu::copy(amrex::Gpu::deviceToHost, d_volumes.begin(), d_volumes.end(),
local_volumes.begin());

if (m_num_components > 0) {
amrex::ParallelAllReduce::Sum(local_volumes.data(), m_num_components,
amrex::ParallelContext::CommunicatorSub());
}
m_volumes = local_volumes;
m_volumes.assign(local_volumes.begin(), local_volumes.end());
}

} // namespace OpenImpala
3 changes: 3 additions & 0 deletions src/props/EffectiveDiffusivityHypre.H
@@ -1,3 +1,3 @@
#ifndef EFFECTIVEDIFFUSIVITYHYPRE_H
#define EFFECTIVEDIFFUSIVITYHYPRE_H

@@ -99,6 +99,9 @@
void getChiSolution(amrex::MultiFab& chi_field);


// --- Public for CUDA __device__ lambda compatibility ---
void initializeDiffCoeff();

// --- Public Getters for Status and Solver Information ---
// Solver statistics are inherited from HypreStructSolver:
// getSolverConverged(), getFinalRelativeResidualNorm(), getSolverIterations()
55 changes: 29 additions & 26 deletions src/props/EffectiveDiffusivityHypre.cpp
@@ -164,31 +164,7 @@ EffectiveDiffusivityHypre::EffectiveDiffusivityHypre(

// Build coefficient MultiFab using a device-accessible lookup table
m_mf_diff_coeff.setVal(0.0);
{
int max_pid = 0;
for (const auto& kv : m_phase_coeff_map) {
max_pid = std::max(max_pid, kv.first);
}
amrex::Gpu::DeviceVector<amrex::Real> d_coeff_lut(max_pid + 1, 0.0);
amrex::Gpu::HostVector<amrex::Real> h_coeff_lut(max_pid + 1, 0.0);
for (const auto& kv : m_phase_coeff_map) {
h_coeff_lut[kv.first] = kv.second;
}
amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_coeff_lut.begin(), h_coeff_lut.end(),
d_coeff_lut.begin());
const amrex::Real* lut_ptr = d_coeff_lut.data();
const int lut_size = max_pid + 1;

for (amrex::MFIter mfi(m_mf_diff_coeff, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
const amrex::Box& bx = mfi.growntilebox();
amrex::Array4<amrex::Real> const dc_arr = m_mf_diff_coeff.array(mfi);
amrex::Array4<const int> const phase_arr = m_mf_phase_original.const_array(mfi);
amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
int pid = phase_arr(i, j, k, 0);
dc_arr(i, j, k, 0) = (pid >= 0 && pid < lut_size) ? lut_ptr[pid] : 0.0;
});
}
}
initializeDiffCoeff();
m_mf_diff_coeff.FillBoundary(m_geom.periodicity());

m_mf_active_mask.setVal(cell_inactive);
@@ -235,6 +211,32 @@ EffectiveDiffusivityHypre::EffectiveDiffusivityHypre(

// Destructor is defaulted in the header — base class handles HYPRE cleanup.

void EffectiveDiffusivityHypre::initializeDiffCoeff() {
int max_pid = 0;
for (const auto& kv : m_phase_coeff_map) {
max_pid = std::max(max_pid, kv.first);
}
amrex::Gpu::DeviceVector<amrex::Real> d_coeff_lut(max_pid + 1, 0.0);
amrex::Gpu::HostVector<amrex::Real> h_coeff_lut(max_pid + 1, 0.0);
for (const auto& kv : m_phase_coeff_map) {
h_coeff_lut[kv.first] = kv.second;
}
amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_coeff_lut.begin(), h_coeff_lut.end(),
d_coeff_lut.begin());
const amrex::Real* lut_ptr = d_coeff_lut.data();
const int lut_size = max_pid + 1;

for (amrex::MFIter mfi(m_mf_diff_coeff, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
const amrex::Box& bx = mfi.growntilebox();
amrex::Array4<amrex::Real> const dc_arr = m_mf_diff_coeff.array(mfi);
amrex::Array4<const int> const phase_arr = m_mf_phase_original.const_array(mfi);
amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
int pid = phase_arr(i, j, k, 0);
dc_arr(i, j, k, 0) = (pid >= 0 && pid < lut_size) ? lut_ptr[pid] : 0.0;
});
}
}

void EffectiveDiffusivityHypre::generateActiveMask() {
BL_PROFILE("EffectiveDiffusivityHypre::generateActiveMask");

@@ -711,7 +713,8 @@ void EffectiveDiffusivityHypre::getChiSolution(amrex::MultiFab& chi_field) {
soln_buffer.data());
if (get_ierr != 0) {
amrex::Warning("HYPRE_StructVectorGetBoxValues failed during getChiSolution!");
chi_field[mfi].setVal(0.0, bx_getsol, ChiComp, numComponentsChi);
chi_field[mfi].template setVal<amrex::RunOn::Host>(0.0, bx_getsol, ChiComp,
numComponentsChi);
continue;
}

5 changes: 4 additions & 1 deletion src/props/FloodFill.cpp
@@ -177,7 +177,10 @@ void parallelFloodFill(amrex::iMultiFab& reachabilityMask, const amrex::iMultiFab

// Reset device flag
#ifdef AMREX_USE_GPU
d_changed = amrex::Gpu::DeviceScalar<int>(0);
{
int zero = 0;
amrex::Gpu::htod_memcpy(d_changed.dataPtr(), &zero, sizeof(int));
}
d_flag_ptr = d_changed.dataPtr();
#else
h_changed = 0;
3 changes: 2 additions & 1 deletion src/props/PercolationCheck.H
@@ -56,9 +56,10 @@ public:
/** @brief Returns a human-readable direction string ("X", "Y", or "Z"). */
static std::string directionString(OpenImpala::Direction dir);

private:
// --- Implementation methods (public for CUDA __device__ lambda compatibility) ---
void run(const amrex::iMultiFab& mf_phase, int phase_id, OpenImpala::Direction dir);

private:
amrex::Geometry m_geom;
amrex::BoxArray m_ba;
amrex::DistributionMapping m_dm;
2 changes: 1 addition & 1 deletion src/props/REVStudy.cpp
@@ -131,7 +131,7 @@ void runREVStudy(const amrex::Geometry& geom_full, const amrex::BoxArray& ba_ful
const amrex::Box& dest_box = mfi.validbox();
amrex::Box src_box = dest_box;
src_box.shift(bx_rev.smallEnd());
dest_fab.copy(src_fab, src_box, 0, dest_box, 0, 1);
dest_fab.template copy<amrex::RunOn::Host>(src_fab, src_box, 0, dest_box, 0, 1);
}
mf_phase_rev.FillBoundary(geom_rev.periodicity());

4 changes: 4 additions & 0 deletions src/props/ThroughThicknessProfile.H
@@ -54,6 +54,10 @@ public:
return static_cast<int>(m_vf_profile.size());
}

/** @brief Compute the profile (public for CUDA __device__ lambda compatibility). */
void compute(const amrex::Geometry& geom, const amrex::iMultiFab& mf_phase, int phase_id,
OpenImpala::Direction dir, int comp);

private:
std::vector<amrex::Real> m_vf_profile;
};