diff --git a/.github/workflows/pypi-wheels-gpu.yml b/.github/workflows/pypi-wheels-gpu.yml index 0302f74..82b2227 100644 --- a/.github/workflows/pypi-wheels-gpu.yml +++ b/.github/workflows/pypi-wheels-gpu.yml @@ -31,7 +31,7 @@ jobs: uses: actions/cache@v4 with: path: .cibw-deps-cache - key: cibw-deps-gpu-cuda12-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-v1 + key: cibw-deps-gpu-cuda12.6-manylinux_2_34-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-gcc13-v2 - name: Build GPU wheels run: python -m cibuildwheel --output-dir wheelhouse @@ -42,7 +42,7 @@ jobs: # Use NVIDIA's CUDA-enabled manylinux image (CUDA 12.6, AlmaLinux 9) # This provides nvcc, CUDA runtime, and cuBLAS/cuSPARSE out of the box. - CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux_2_28_x86_64_cuda_12.6 + CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux_2_34_x86_64_cuda_12.6 # Build all dependencies with CUDA support. # HDF5 and libtiff are CPU-only (no GPU path needed). @@ -50,11 +50,16 @@ jobs: # AMReX is built with -DAMReX_GPU_BACKEND=CUDA for device kernels. 
CIBW_BEFORE_ALL_LINUX: > dnf install -y epel-release && - dnf --enablerepo=powertools install -y + dnf --enablerepo=crb install -y openmpi-devel gcc-gfortran gcc-c++ wget git - zlib-devel libjpeg-turbo-devel python3-pip && + zlib-devel libjpeg-turbo-devel python3-pip + gcc-toolset-13 gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ + gcc-toolset-13-gcc-gfortran && pip3 install "cmake>=3.28,<4" && - export PATH=/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH && + export PATH=/opt/rh/gcc-toolset-13/root/usr/bin:/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH && + export CC=/opt/rh/gcc-toolset-13/root/usr/bin/gcc && + export CXX=/opt/rh/gcc-toolset-13/root/usr/bin/g++ && + export FC=/opt/rh/gcc-toolset-13/root/usr/bin/gfortran && export CUDA_HOME=/usr/local/cuda && if [ -f /project/.cibw-deps-cache/deps.tar.gz ]; then echo "=== Restoring cached GPU dependencies ===" && @@ -92,7 +97,8 @@ jobs: --with-cuda-home=/usr/local/cuda --enable-shared=no CC=mpicc CXX=mpicxx FC=mpif90 CFLAGS="-O2 -fPIC" CXXFLAGS="-O2 -fPIC" FFLAGS="-O2 -fPIC" - CUDA_HOME=/usr/local/cuda && + CUDA_HOME=/usr/local/cuda + CUDAFLAGS="-allow-unsupported-compiler" && make -j$(nproc) && make install && cd ../.. && @@ -107,26 +113,35 @@ jobs: -DAMReX_FORTRAN=ON -DAMReX_PARTICLES=OFF -DAMReX_GPU_BACKEND=CUDA - -DAMReX_CUDA_ARCH=60;70;75;80;86;89;90 + '-DAMReX_CUDA_ARCH=60;70;75;80;86;89;90' -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_CUDA_ARCHITECTURES="60;70;75;80;86;89;90" && + '-DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90' + -DCMAKE_CUDA_HOST_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/g++ && cmake --build /tmp/amrex/build -j$(nproc) && cmake --install /tmp/amrex/build && mkdir -p /project/.cibw-deps-cache && tar czf /project/.cibw-deps-cache/deps.tar.gz /usr/local ; fi - CIBW_BEFORE_BUILD: pip install "cmake>=3.28,<4" + # Rename the package to openimpala-cuda for the GPU wheel. + # The import name stays 'openimpala' — only the PyPI distribution name changes. 
+ CIBW_BEFORE_BUILD: > + pip install "cmake>=3.28,<4" && + sed -i 's/^name = "openimpala"/name = "openimpala-cuda"/' /project/pyproject.toml # Point to MPI, CUDA, and our compiled GPU dependencies. CIBW_ENVIRONMENT_LINUX: > - PATH="/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH" + PATH="/opt/rh/gcc-toolset-13/root/usr/bin:/usr/lib64/openmpi/bin:/usr/local/cuda/bin:$PATH" + CC="/opt/rh/gcc-toolset-13/root/usr/bin/gcc" + CXX="/opt/rh/gcc-toolset-13/root/usr/bin/g++" + FC="/opt/rh/gcc-toolset-13/root/usr/bin/gfortran" CUDA_HOME="/usr/local/cuda" + CUDAHOSTCXX="/opt/rh/gcc-toolset-13/root/usr/bin/g++" CMAKE_C_COMPILER="mpicc" CMAKE_CXX_COMPILER="mpicxx" CMAKE_PREFIX_PATH="/usr/local" CMAKE_GENERATOR="Unix Makefiles" - CMAKE_ARGS="-DGPU_BACKEND=CUDA -DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90" + CMAKE_ARGS="-DGPU_BACKEND=CUDA '-DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90' -DCMAKE_CUDA_HOST_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/g++" # Vendor libraries but exclude host-specific MPI, OpenMP, Fortran runtime, # and CUDA runtime libraries (users must have CUDA toolkit installed). 
diff --git a/notebooks/profiling_and_tuning.ipynb b/notebooks/profiling_and_tuning.ipynb index b130165..83866cc 100644 --- a/notebooks/profiling_and_tuning.ipynb +++ b/notebooks/profiling_and_tuning.ipynb @@ -75,7 +75,7 @@ "source": [ "# Install system MPI and Python packages\n", "!apt-get install -y libopenmpi-dev > /dev/null 2>&1\n", - "!pip install openimpala[all] > /dev/null 2>&1\n", + "!pip install openimpala-cuda[all] > /dev/null 2>&1\n", "!pip install porespy > /dev/null 2>&1\n", "print(\"Dependencies installed.\")" ] diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 42243a7..abe260e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -21,7 +21,7 @@ find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) find_package(pybind11 2.11 CONFIG REQUIRED) # --- Build the extension module --- -pybind11_add_module(_core +set(BINDING_SOURCES bindings/module.cpp bindings/enums.cpp bindings/io.cpp @@ -29,6 +29,13 @@ pybind11_add_module(_core bindings/solvers.cpp bindings/config.cpp ) +pybind11_add_module(_core ${BINDING_SOURCES}) + +# When CUDA is enabled, AMReX headers contain __host__/__device__ attributes +# that require compilation by nvcc. 
+if(GPU_BACKEND STREQUAL "CUDA") + set_source_files_properties(${BINDING_SOURCES} PROPERTIES LANGUAGE CUDA) +endif() target_link_libraries(_core PRIVATE openimpala_io diff --git a/python/openimpala/__init__.py b/python/openimpala/__init__.py index 105ef07..18fe931 100644 --- a/python/openimpala/__init__.py +++ b/python/openimpala/__init__.py @@ -27,7 +27,10 @@ try: __version__ = version("openimpala") except PackageNotFoundError: - __version__ = "unknown" + try: + __version__ = version("openimpala-cuda") + except PackageNotFoundError: + __version__ = "unknown" # Session context manager (pure Python — always available) from .session import Session diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 91ce955..597eb25 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,7 +7,7 @@ # IO Library # ============================================================================== # Library sources (exclude test drivers that start with 't') -add_library(openimpala_io OBJECT +set(IO_SOURCES io/CathodeWrite.cpp io/DatReader.cpp io/HDF5Reader.cpp @@ -15,6 +15,7 @@ add_library(openimpala_io OBJECT io/RawReader.cpp io/TiffReader.cpp ) +add_library(openimpala_io OBJECT ${IO_SOURCES}) target_include_directories(openimpala_io PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/io @@ -36,7 +37,7 @@ target_include_directories(openimpala_io PUBLIC # ============================================================================== # Props Library (C++ only — Fortran kernels migrated to native C++) # ============================================================================== -add_library(openimpala_props OBJECT +set(PROPS_SOURCES props/ConnectedComponents.cpp props/DeffTensor.cpp props/EffectiveDiffusivityHypre.cpp @@ -53,6 +54,7 @@ add_library(openimpala_props OBJECT props/TortuositySolverBase.cpp props/VolumeFraction.cpp ) +add_library(openimpala_props OBJECT ${PROPS_SOURCES}) target_include_directories(openimpala_props PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/props @@ -101,5 +103,19 @@ 
target_include_directories(Diffusion PRIVATE ${HDF5_INCLUDE_DIRS} ) +# ============================================================================== +# CUDA: compile all C++ sources as CUDA when GPU backend is enabled +# ============================================================================== +# AMReX headers use __host__/__device__ attributes (AMREX_GPU_HOST_DEVICE) +# when built with CUDA. Any translation unit that includes AMReX headers must +# therefore be compiled by nvcc. We achieve this by setting the LANGUAGE +# property to CUDA on all .cpp source files. +if(GPU_BACKEND STREQUAL "CUDA") + set_source_files_properties( + ${IO_SOURCES} ${PROPS_SOURCES} props/Diffusion.cpp + PROPERTIES LANGUAGE CUDA + ) +endif() + # Install the main executable install(TARGETS Diffusion RUNTIME DESTINATION bin) diff --git a/src/io/HDF5Reader.H b/src/io/HDF5Reader.H index 4af9eb6..579ad96 100644 --- a/src/io/HDF5Reader.H +++ b/src/io/HDF5Reader.H @@ -160,6 +160,14 @@ public: } + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- + // Template function to read a hyperslab and apply thresholding + // Needed because the native type read from file varies. + template + void readAndThresholdFab(H5::DataSet& dataset, double raw_threshold, int value_if_true, + int value_if_false, const amrex::Box& box, + amrex::IArrayBox& fab) const; + private: /** * @brief Internal implementation for reading HDF5 metadata only. @@ -169,13 +177,6 @@ private: */ bool readMetadataInternal(); - // Template function to read a hyperslab and apply thresholding - // Needed because the native type read from file varies. 
- template - void readAndThresholdFab(H5::DataSet& dataset, double raw_threshold, int value_if_true, - int value_if_false, const amrex::Box& box, - amrex::IArrayBox& fab) const; - // --- Member Variables --- std::string m_filename; /**< Filename of the source HDF5 file */ std::string m_hdf5dataset; /**< Path to the dataset within the HDF5 file */ diff --git a/src/props/ConnectedComponents.H b/src/props/ConnectedComponents.H index a879cd7..b8bf127 100644 --- a/src/props/ConnectedComponents.H +++ b/src/props/ConnectedComponents.H @@ -59,12 +59,13 @@ public: return m_labels; } -private: + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- void run(const amrex::iMultiFab& mf_phase, int phase_id); amrex::IntVect findNextUnlabeled(const amrex::iMultiFab& labelMF, const amrex::iMultiFab& phaseFab, int phaseID) const; +private: amrex::Geometry m_geom; amrex::BoxArray m_ba; amrex::DistributionMapping m_dm; diff --git a/src/props/ConnectedComponents.cpp b/src/props/ConnectedComponents.cpp index 96864f7..c424ec0 100644 --- a/src/props/ConnectedComponents.cpp +++ b/src/props/ConnectedComponents.cpp @@ -108,8 +108,8 @@ void ConnectedComponents::run(const amrex::iMultiFab& mf_phase, int phase_id) { // Compute volume of each component using GPU-compatible atomic scatter-add m_volumes.resize(m_num_components, 0); - amrex::Gpu::DeviceVector d_volumes(m_num_components, 0); - long long* d_vol_ptr = d_volumes.data(); + amrex::Gpu::DeviceVector d_volumes(m_num_components, 0); + int* d_vol_ptr = d_volumes.data(); const int num_comp = m_num_components; for (amrex::MFIter mfi(m_labels); mfi.isValid(); ++mfi) { @@ -119,12 +119,12 @@ void ConnectedComponents::run(const amrex::iMultiFab& mf_phase, int phase_id) { amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { int lbl = label_arr(i, j, k, 0); if (lbl > 0 && lbl <= num_comp) { - amrex::Gpu::Atomic::Add(&d_vol_ptr[lbl - 1], 1LL); + amrex::Gpu::Atomic::Add(&d_vol_ptr[lbl - 1], 1); } 
}); } - std::vector local_volumes(m_num_components); + std::vector local_volumes(m_num_components); amrex::Gpu::copy(amrex::Gpu::deviceToHost, d_volumes.begin(), d_volumes.end(), local_volumes.begin()); @@ -132,7 +132,7 @@ void ConnectedComponents::run(const amrex::iMultiFab& mf_phase, int phase_id) { amrex::ParallelAllReduce::Sum(local_volumes.data(), m_num_components, amrex::ParallelContext::CommunicatorSub()); } - m_volumes = local_volumes; + m_volumes.assign(local_volumes.begin(), local_volumes.end()); } } // namespace OpenImpala diff --git a/src/props/EffectiveDiffusivityHypre.H b/src/props/EffectiveDiffusivityHypre.H index 5381ce1..26a3255 100644 --- a/src/props/EffectiveDiffusivityHypre.H +++ b/src/props/EffectiveDiffusivityHypre.H @@ -99,6 +99,9 @@ public: void getChiSolution(amrex::MultiFab& chi_field); + // --- Public for CUDA __device__ lambda compatibility --- + void initializeDiffCoeff(); + // --- Public Getters for Status and Solver Information --- // Solver statistics are inherited from HypreStructSolver: // getSolverConverged(), getFinalRelativeResidualNorm(), getSolverIterations() diff --git a/src/props/EffectiveDiffusivityHypre.cpp b/src/props/EffectiveDiffusivityHypre.cpp index 38ee8e8..7218f7f 100644 --- a/src/props/EffectiveDiffusivityHypre.cpp +++ b/src/props/EffectiveDiffusivityHypre.cpp @@ -164,31 +164,7 @@ EffectiveDiffusivityHypre::EffectiveDiffusivityHypre( // Build coefficient MultiFab using a device-accessible lookup table m_mf_diff_coeff.setVal(0.0); - { - int max_pid = 0; - for (const auto& kv : m_phase_coeff_map) { - max_pid = std::max(max_pid, kv.first); - } - amrex::Gpu::DeviceVector d_coeff_lut(max_pid + 1, 0.0); - amrex::Gpu::HostVector h_coeff_lut(max_pid + 1, 0.0); - for (const auto& kv : m_phase_coeff_map) { - h_coeff_lut[kv.first] = kv.second; - } - amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_coeff_lut.begin(), h_coeff_lut.end(), - d_coeff_lut.begin()); - const amrex::Real* lut_ptr = d_coeff_lut.data(); - const int lut_size 
= max_pid + 1; - - for (amrex::MFIter mfi(m_mf_diff_coeff, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { - const amrex::Box& bx = mfi.growntilebox(); - amrex::Array4 const dc_arr = m_mf_diff_coeff.array(mfi); - amrex::Array4 const phase_arr = m_mf_phase_original.const_array(mfi); - amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { - int pid = phase_arr(i, j, k, 0); - dc_arr(i, j, k, 0) = (pid >= 0 && pid < lut_size) ? lut_ptr[pid] : 0.0; - }); - } - } + initializeDiffCoeff(); m_mf_diff_coeff.FillBoundary(m_geom.periodicity()); m_mf_active_mask.setVal(cell_inactive); @@ -235,6 +211,32 @@ EffectiveDiffusivityHypre::EffectiveDiffusivityHypre( // Destructor is defaulted in the header — base class handles HYPRE cleanup. +void EffectiveDiffusivityHypre::initializeDiffCoeff() { + int max_pid = 0; + for (const auto& kv : m_phase_coeff_map) { + max_pid = std::max(max_pid, kv.first); + } + amrex::Gpu::DeviceVector d_coeff_lut(max_pid + 1, 0.0); + amrex::Gpu::HostVector h_coeff_lut(max_pid + 1, 0.0); + for (const auto& kv : m_phase_coeff_map) { + h_coeff_lut[kv.first] = kv.second; + } + amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_coeff_lut.begin(), h_coeff_lut.end(), + d_coeff_lut.begin()); + const amrex::Real* lut_ptr = d_coeff_lut.data(); + const int lut_size = max_pid + 1; + + for (amrex::MFIter mfi(m_mf_diff_coeff, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + amrex::Array4 const dc_arr = m_mf_diff_coeff.array(mfi); + amrex::Array4 const phase_arr = m_mf_phase_original.const_array(mfi); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { + int pid = phase_arr(i, j, k, 0); + dc_arr(i, j, k, 0) = (pid >= 0 && pid < lut_size) ? 
lut_ptr[pid] : 0.0; + }); + } +} + void EffectiveDiffusivityHypre::generateActiveMask() { BL_PROFILE("EffectiveDiffusivityHypre::generateActiveMask"); @@ -711,7 +713,8 @@ void EffectiveDiffusivityHypre::getChiSolution(amrex::MultiFab& chi_field) { soln_buffer.data()); if (get_ierr != 0) { amrex::Warning("HYPRE_StructVectorGetBoxValues failed during getChiSolution!"); - chi_field[mfi].setVal(0.0, bx_getsol, ChiComp, numComponentsChi); + chi_field[mfi].template setVal(0.0, bx_getsol, ChiComp, + numComponentsChi); continue; } diff --git a/src/props/FloodFill.cpp b/src/props/FloodFill.cpp index 5f30766..7cf2c49 100644 --- a/src/props/FloodFill.cpp +++ b/src/props/FloodFill.cpp @@ -177,7 +177,10 @@ void parallelFloodFill(amrex::iMultiFab& reachabilityMask, const amrex::iMultiFa // Reset device flag #ifdef AMREX_USE_GPU - d_changed = amrex::Gpu::DeviceScalar(0); + { + int zero = 0; + amrex::Gpu::htod_memcpy(d_changed.dataPtr(), &zero, sizeof(int)); + } d_flag_ptr = d_changed.dataPtr(); #else h_changed = 0; diff --git a/src/props/PercolationCheck.H b/src/props/PercolationCheck.H index 5d27f0a..f67a3a8 100644 --- a/src/props/PercolationCheck.H +++ b/src/props/PercolationCheck.H @@ -56,9 +56,10 @@ public: /** @brief Returns a human-readable direction string ("X", "Y", or "Z"). 
*/ static std::string directionString(OpenImpala::Direction dir); -private: + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- void run(const amrex::iMultiFab& mf_phase, int phase_id, OpenImpala::Direction dir); +private: amrex::Geometry m_geom; amrex::BoxArray m_ba; amrex::DistributionMapping m_dm; diff --git a/src/props/REVStudy.cpp b/src/props/REVStudy.cpp index 2d4ae7d..20f2aa9 100644 --- a/src/props/REVStudy.cpp +++ b/src/props/REVStudy.cpp @@ -131,7 +131,7 @@ void runREVStudy(const amrex::Geometry& geom_full, const amrex::BoxArray& ba_ful const amrex::Box& dest_box = mfi.validbox(); amrex::Box src_box = dest_box; src_box.shift(bx_rev.smallEnd()); - dest_fab.copy(src_fab, src_box, 0, dest_box, 0, 1); + dest_fab.template copy(src_fab, src_box, 0, dest_box, 0, 1); } mf_phase_rev.FillBoundary(geom_rev.periodicity()); diff --git a/src/props/ThroughThicknessProfile.H b/src/props/ThroughThicknessProfile.H index 2d200c2..a4cc428 100644 --- a/src/props/ThroughThicknessProfile.H +++ b/src/props/ThroughThicknessProfile.H @@ -54,6 +54,10 @@ public: return static_cast(m_vf_profile.size()); } + /** @brief Compute the profile (public for CUDA __device__ lambda compatibility). 
*/ + void compute(const amrex::Geometry& geom, const amrex::iMultiFab& mf_phase, int phase_id, + OpenImpala::Direction dir, int comp); + private: std::vector m_vf_profile; }; diff --git a/src/props/ThroughThicknessProfile.cpp b/src/props/ThroughThicknessProfile.cpp index 81bb6b3..8e6c24c 100644 --- a/src/props/ThroughThicknessProfile.cpp +++ b/src/props/ThroughThicknessProfile.cpp @@ -12,6 +12,11 @@ namespace OpenImpala { ThroughThicknessProfile::ThroughThicknessProfile(const amrex::Geometry& geom, const amrex::iMultiFab& mf_phase, int phase_id, OpenImpala::Direction dir, int comp) { + compute(geom, mf_phase, phase_id, dir, comp); +} + +void ThroughThicknessProfile::compute(const amrex::Geometry& geom, const amrex::iMultiFab& mf_phase, + int phase_id, OpenImpala::Direction dir, int comp) { AMREX_ALWAYS_ASSERT_WITH_MESSAGE(comp >= 0 && comp < mf_phase.nComp(), "ThroughThicknessProfile: Component index out of bounds."); @@ -20,10 +25,10 @@ ThroughThicknessProfile::ThroughThicknessProfile(const amrex::Geometry& geom, const int n_slices = domain.length(idir); // GPU-compatible per-slice accumulation using device vectors with atomic scatter-add - amrex::Gpu::DeviceVector d_slice_phase(n_slices, 0); - amrex::Gpu::DeviceVector d_slice_total(n_slices, 0); - long long* d_phase_ptr = d_slice_phase.data(); - long long* d_total_ptr = d_slice_total.data(); + amrex::Gpu::DeviceVector d_slice_phase(n_slices, 0); + amrex::Gpu::DeviceVector d_slice_total(n_slices, 0); + int* d_phase_ptr = d_slice_phase.data(); + int* d_total_ptr = d_slice_total.data(); const int target_phase = phase_id; const int phase_comp = comp; @@ -36,16 +41,16 @@ ThroughThicknessProfile::ThroughThicknessProfile(const amrex::Geometry& geom, amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { int coord = (idir == 0) ? i : (idir == 1) ? 
j : k; int idx = coord - slice_lo; - amrex::Gpu::Atomic::Add(&d_total_ptr[idx], 1LL); + amrex::Gpu::Atomic::Add(&d_total_ptr[idx], 1); if (fab(i, j, k) == target_phase) { - amrex::Gpu::Atomic::Add(&d_phase_ptr[idx], 1LL); + amrex::Gpu::Atomic::Add(&d_phase_ptr[idx], 1); } }); } // Copy device results to host - std::vector slice_phase(n_slices); - std::vector slice_total(n_slices); + std::vector slice_phase(n_slices); + std::vector slice_total(n_slices); amrex::Gpu::copy(amrex::Gpu::deviceToHost, d_slice_phase.begin(), d_slice_phase.end(), slice_phase.begin()); amrex::Gpu::copy(amrex::Gpu::deviceToHost, d_slice_total.begin(), d_slice_total.end(), diff --git a/src/props/TortuosityDirect.H b/src/props/TortuosityDirect.H index e7728d0..9f6678c 100644 --- a/src/props/TortuosityDirect.H +++ b/src/props/TortuosityDirect.H @@ -116,73 +116,15 @@ public: } -private: - /** @brief Executes the iterative solver loop until convergence or max iterations. Returns true - * if converged. */ + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- bool solve(); - - /** - * @brief Computes the global fluxes entering and exiting the domain boundaries. - * Expected to be called after solve() has successfully completed. - * @param fxin Output: Total flux entering the domain through the low boundary face. - * @param fxout Output: Total flux exiting the domain through the high boundary face. - */ void global_fluxes(amrex::Real& fxin, amrex::Real& fxout) const; - - /** - * @brief Computes a residual norm (e.g., L1 norm of change) between two solution states. - * Used as a convergence check in the iterative solver. - * @param phiold MultiFab containing the solution from the previous iteration. - * @param phinew MultiFab containing the solution from the current iteration. - * @return The calculated residual norm. 
- */ amrex::Real residual(const amrex::MultiFab& phiold, const amrex::MultiFab& phinew) const; - - /** @brief Sets up the m_bc member variable describing boundary condition types (e.g., - * Dirichlet, Neumann). */ - // void initializeBoundaryConditions() ; // FIX 2: Corrected spelling - void initializeBoundaryConditions(); // <<< FIX 2: Corrected spelling >>> - - /** @brief Creates and initializes the m_flux MultiFab array for storing face fluxes. */ - // void initializeFluxMultiFabs(); // FIX 2: Corrected spelling - void initializeFluxMultiFabs(); // <<< FIX 2: Corrected spelling >>> - - /** - * @brief Applies boundary conditions to the ghost cells of the potential MultiFab for a - * specific component. Uses the geometry and m_bc settings. Primarily handles external - * Dirichlet. - * @param phi The MultiFab whose ghost cells need filling based on BCs. - * @param comp The component index to fill boundary conditions for. - */ + void initializeBoundaryConditions(); + void initializeFluxMultiFabs(); void fillDomainBoundary(amrex::MultiFab& phi, int comp); - - - /** - * @brief Fills a component of a MultiFab with CellType information (e.g., 0=blocked, 1=free). - * Based on the input phase data (m_mf_phase). - * @param phi The MultiFab to fill (typically fills component comp_ct). - * Requires at least 2 components. - */ - void fillCellTypes(amrex::MultiFab& phi); // Fills component comp_ct - - /** - * @brief Sets the initial guess for the potential field (phi component). - * Often a linear interpolation between boundary values (m_vlo, m_vhi) in the specified - * direction. - * @param phi The MultiFab to fill with the initial state (modifies comp_phi). - */ - void fillInitialState(amrex::MultiFab& phi); // Fills component comp_phi - - - /** - * @brief Performs one iteration of the Finite Volume solver (e.g., Jacobi, SOR). - * Calculates face fluxes (updates m_flux) based on phi_old. 
- * Updates the potential field phi_new based on conservation laws using calculated fluxes. - * @param phi_old Input: Potential field from the previous iteration (requires filled ghost - * cells). - * @param phi_new Output: Updated potential field for the current iteration. - * @param dt Timestep or relaxation parameter used in the update. - */ + void fillCellTypes(amrex::MultiFab& phi); + void fillInitialState(amrex::MultiFab& phi); void advance(amrex::MultiFab& phi_old, amrex::MultiFab& phi_new, const amrex::Real& dt); diff --git a/src/props/TortuosityHypre.H b/src/props/TortuosityHypre.H index a3e5f4b..ac99b57 100644 --- a/src/props/TortuosityHypre.H +++ b/src/props/TortuosityHypre.H @@ -152,10 +152,11 @@ public: } -private: - // --- Private Methods --- + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- bool solve(); void setupMatrixEquation(); + void initializeDiffCoeff(); + amrex::iMultiFab buildTraversableMask(); void preconditionPhaseFab(); void generateActivityMask(const amrex::iMultiFab& phaseFab, int phaseID, OpenImpala::Direction dir); @@ -164,6 +165,7 @@ private: void global_fluxes(); void computePlaneFluxes(const amrex::MultiFab& mf_soln); +private: // --- Member Variables --- // Configuration (solver config m_solvertype/m_eps/m_maxiter/m_verbose are in base class) std::string m_resultspath; diff --git a/src/props/TortuosityHypre.cpp b/src/props/TortuosityHypre.cpp index 796921f..777ebee 100644 --- a/src/props/TortuosityHypre.cpp +++ b/src/props/TortuosityHypre.cpp @@ -216,32 +216,7 @@ OpenImpala::TortuosityHypre::TortuosityHypre(const amrex::Geometry& geom, const if (m_verbose > 1 && amrex::ParallelDescriptor::IOProcessor()) amrex::Print() << "TortuosityHypre: Building diffusion coefficient field..." 
<< std::endl; m_mf_diff_coeff.setVal(0.0); - // Flatten phase coefficient map to a device-accessible lookup table - { - int max_pid = 0; - for (const auto& kv : m_phase_coeff_map) { - max_pid = std::max(max_pid, kv.first); - } - amrex::Gpu::DeviceVector d_coeff_lut(max_pid + 1, 0.0); - amrex::Gpu::HostVector h_coeff_lut(max_pid + 1, 0.0); - for (const auto& kv : m_phase_coeff_map) { - h_coeff_lut[kv.first] = kv.second; - } - amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_coeff_lut.begin(), h_coeff_lut.end(), - d_coeff_lut.begin()); - const amrex::Real* lut_ptr = d_coeff_lut.data(); - const int lut_size = max_pid + 1; - - for (amrex::MFIter mfi(m_mf_diff_coeff, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { - const amrex::Box& bx = mfi.growntilebox(); - amrex::Array4 const dc_arr = m_mf_diff_coeff.array(mfi); - amrex::Array4 const phase_arr = m_mf_phase.const_array(mfi); - amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { - int pid = phase_arr(i, j, k, 0); - dc_arr(i, j, k, 0) = (pid >= 0 && pid < lut_size) ? lut_ptr[pid] : 0.0; - }); - } - } + initializeDiffCoeff(); m_mf_diff_coeff.FillBoundary(m_geom.periodicity()); // --- For multi-phase: create binary traversable mask for flood fill --- @@ -252,19 +227,7 @@ OpenImpala::TortuosityHypre::TortuosityHypre(const amrex::Geometry& geom, const << std::endl; if (m_is_multi_phase) { - // Create a temporary binary phase fab: 1 where D > 0, 0 otherwise - amrex::iMultiFab mf_binary_traversable(m_ba, m_dm, 1, m_mf_phase.nGrow()); - mf_binary_traversable.setVal(0); - for (amrex::MFIter mfi(mf_binary_traversable, amrex::TilingIfNotGPU()); mfi.isValid(); - ++mfi) { - const amrex::Box& bx = mfi.growntilebox(); - amrex::Array4 const trav_arr = mf_binary_traversable.array(mfi); - amrex::Array4 const dc_arr = m_mf_diff_coeff.const_array(mfi); - amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { - trav_arr(i, j, k, 0) = (dc_arr(i, j, k, 0) > 0.0) ? 
1 : 0; - }); - } - mf_binary_traversable.FillBoundary(m_geom.periodicity()); + amrex::iMultiFab mf_binary_traversable = buildTraversableMask(); // Flood through all traversable cells (phase ID = 1 in binary fab) generateActivityMask(mf_binary_traversable, 1, m_dir); } else { @@ -303,6 +266,48 @@ OpenImpala::TortuosityHypre::TortuosityHypre(const amrex::Geometry& geom, const // Destructor is defaulted — HypreStructSolver base class handles HYPRE cleanup. +void TortuosityHypre::initializeDiffCoeff() { + // Flatten phase coefficient map to a device-accessible lookup table + int max_pid = 0; + for (const auto& kv : m_phase_coeff_map) { + max_pid = std::max(max_pid, kv.first); + } + amrex::Gpu::DeviceVector d_coeff_lut(max_pid + 1, 0.0); + amrex::Gpu::HostVector h_coeff_lut(max_pid + 1, 0.0); + for (const auto& kv : m_phase_coeff_map) { + h_coeff_lut[kv.first] = kv.second; + } + amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_coeff_lut.begin(), h_coeff_lut.end(), + d_coeff_lut.begin()); + const amrex::Real* lut_ptr = d_coeff_lut.data(); + const int lut_size = max_pid + 1; + + for (amrex::MFIter mfi(m_mf_diff_coeff, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + amrex::Array4 const dc_arr = m_mf_diff_coeff.array(mfi); + amrex::Array4 const phase_arr = m_mf_phase.const_array(mfi); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { + int pid = phase_arr(i, j, k, 0); + dc_arr(i, j, k, 0) = (pid >= 0 && pid < lut_size) ? 
lut_ptr[pid] : 0.0; + }); + } +} + +amrex::iMultiFab TortuosityHypre::buildTraversableMask() { + // Create a temporary binary phase fab: 1 where D > 0, 0 otherwise + amrex::iMultiFab mf_binary_traversable(m_ba, m_dm, 1, m_mf_phase.nGrow()); + mf_binary_traversable.setVal(0); + for (amrex::MFIter mfi(mf_binary_traversable, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + amrex::Array4 const trav_arr = mf_binary_traversable.array(mfi); + amrex::Array4 const dc_arr = m_mf_diff_coeff.const_array(mfi); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { + trav_arr(i, j, k, 0) = (dc_arr(i, j, k, 0) > 0.0) ? 1 : 0; + }); + } + mf_binary_traversable.FillBoundary(m_geom.periodicity()); + return mf_binary_traversable; +} // setupGrid() and setupStencil() are now provided by HypreStructSolver base class. diff --git a/src/props/TortuosityMLMG.H b/src/props/TortuosityMLMG.H index 2e1c1c4..6bdb198 100644 --- a/src/props/TortuosityMLMG.H +++ b/src/props/TortuosityMLMG.H @@ -70,11 +70,11 @@ public: TortuosityMLMG(const TortuosityMLMG&) = delete; TortuosityMLMG& operator=(const TortuosityMLMG&) = delete; -protected: /** @brief Run the MLMG solver, populating m_mf_solution. * * Sets up MLABecLaplacian with appropriate BCs and B-coefficients, * runs MLMG V-cycle solve, and optionally writes a plotfile. + * Public for CUDA __device__ lambda compatibility. * * @return true if solver converged. */ diff --git a/src/props/TortuositySolverBase.H b/src/props/TortuositySolverBase.H index 473ac9f..c1607fa 100644 --- a/src/props/TortuositySolverBase.H +++ b/src/props/TortuositySolverBase.H @@ -112,7 +112,9 @@ public: return m_phase_coeff_map; } -protected: + // --- Implementation methods (public for CUDA __device__ lambda compatibility) --- + // These methods contain GPU device lambdas and must be publicly accessible for nvcc. + /** @brief Run the linear solver, populating m_mf_solution. 
* * Subclasses must: @@ -137,6 +139,7 @@ protected: void computePlaneFluxes(const amrex::MultiFab& mf_soln); void writeSolutionPlotfile(const std::string& label); +protected: // --- Grid data (stored by value to avoid dangling references) --- amrex::Geometry m_geom; amrex::BoxArray m_ba; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 524000a..09d6d5d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -29,6 +29,10 @@ set(OPENIMPALA_TEST_INCLUDES # Helper: define a test executable and register it with CTest. # If an inputs file exists in tests/inputs/.inputs, it is passed as an argument. function(openimpala_add_test TEST_NAME SOURCE_FILE) + # When CUDA is enabled, AMReX headers require nvcc compilation + if(GPU_BACKEND STREQUAL "CUDA") + set_source_files_properties(${SOURCE_FILE} PROPERTIES LANGUAGE CUDA) + endif() add_executable(${TEST_NAME} ${SOURCE_FILE}) target_link_libraries(${TEST_NAME} PRIVATE ${OPENIMPALA_TEST_LIBS}) target_include_directories(${TEST_NAME} PRIVATE ${OPENIMPALA_TEST_INCLUDES}) @@ -421,6 +425,49 @@ set_tests_properties(tDiffusion_hdf5 PROPERTIES TIMEOUT 600 ) +# ============================================================================== +# FloodFill Tests (Issue #170 — shared GPU-compatible flood fill utility) +# ============================================================================== +openimpala_add_test(tFloodFill "${CMAKE_CURRENT_SOURCE_DIR}/tFloodFill.cpp") + +# ============================================================================== +# TortuosityMLMG Tests (Issue #171 — matrix-free MLMG solver) +# +# Validates the AMReX-native MLMG solver against the same analytical +# solutions used for the HYPRE-based solver. 
+# ============================================================================== +openimpala_add_test(tTortuosityMLMG "${CMAKE_CURRENT_SOURCE_DIR}/tTortuosityMLMG.cpp") + +# MLMG uniform block: tau = 1.0 (Dirichlet BCs at the domain face; see inputs file) +add_test( + NAME tTortuosityMLMG_uniform + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 1 + ${MPIEXEC_PREFLAGS} + $<TARGET_FILE:tTortuosityMLMG> + ${CMAKE_SOURCE_DIR}/tests/inputs/tTortuosityMLMG_uniform.inputs + amrex.verbose=0 + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} +) +set_tests_properties(tTortuosityMLMG_uniform PROPERTIES + ENVIRONMENT "OMPI_ALLOW_RUN_AS_ROOT=1;OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1" + TIMEOUT 300 +) + +# MLMG Y-direction symmetry: same tau as X +add_test( + NAME tTortuosityMLMG_dirY + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 1 + ${MPIEXEC_PREFLAGS} + $<TARGET_FILE:tTortuosityMLMG> + ${CMAKE_SOURCE_DIR}/tests/inputs/tTortuosityMLMG_dirY.inputs + amrex.verbose=0 + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} +) +set_tests_properties(tTortuosityMLMG_dirY PROPERTIES + ENVIRONMENT "OMPI_ALLOW_RUN_AS_ROOT=1;OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1" + TIMEOUT 300 +) + # ============================================================================== # Unit Tests (Catch2 — fast, no MPI/AMReX required) # ============================================================================== diff --git a/tests/inputs/tTortuosityMLMG_dirY.inputs b/tests/inputs/tTortuosityMLMG_dirY.inputs new file mode 100644 index 0000000..a6d51ee --- /dev/null +++ b/tests/inputs/tTortuosityMLMG_dirY.inputs @@ -0,0 +1,21 @@ +# TortuosityMLMG Test: Y Direction +# +# Same as uniform test but in Y direction — validates directional symmetry. 
+ +domain_size = 32 +box_size = 16 +verbose = 2 +num_phases_fill = 1 +direction = Y + +expected_tau = 1.0 +tau_tolerance = 0.001 + +resultsdir = ./tTortuosityMLMG_dirY_results + +mlmg.eps = 1e-9 +mlmg.maxiter = 200 + +tortuosity.active_phases = 0 +tortuosity.phase_diffusivities = 1.0 +tortuosity.remspot_passes = 0 diff --git a/tests/inputs/tTortuosityMLMG_uniform.inputs b/tests/inputs/tTortuosityMLMG_uniform.inputs new file mode 100644 index 0000000..60d4478 --- /dev/null +++ b/tests/inputs/tTortuosityMLMG_uniform.inputs @@ -0,0 +1,27 @@ +# TortuosityMLMG Test: Uniform Dense Block +# +# Geometry: 32^3 domain, all voxels = single conducting phase (D=1.0) +# Analytical: MLMG applies Dirichlet BCs at the domain face (external boundary), +# not at cell centers like HYPRE. For a uniform medium this gives +# tau = 1.0 exactly. +# Validates: TortuosityMLMG solver + TortuositySolverBase infrastructure + +domain_size = 32 +box_size = 16 +verbose = 2 +num_phases_fill = 1 +direction = X + +expected_tau = 1.0 +tau_tolerance = 0.001 + +resultsdir = ./tTortuosityMLMG_uniform_results + +# --- MLMG Solver Controls --- +mlmg.eps = 1e-9 +mlmg.maxiter = 200 + +# --- Multi-phase configuration --- +tortuosity.active_phases = 0 +tortuosity.phase_diffusivities = 1.0 +tortuosity.remspot_passes = 0 diff --git a/tests/tFloodFill.cpp b/tests/tFloodFill.cpp new file mode 100644 index 0000000..6cca044 --- /dev/null +++ b/tests/tFloodFill.cpp @@ -0,0 +1,310 @@ +// tests/tFloodFill.cpp +// +// Direct tests for the shared FloodFill utility (FloodFill.H/cpp). +// +// Validates: +// 1. Full flood on uniform domain — all cells reachable from a single seed +// 2. Partial flood — two isolated blocks, only seeded block is reached +// 3. collectBoundarySeeds — correct inlet/outlet seed collection +// 4. 
Multi-label flood — two separate floods with distinct labels + +#include "FloodFill.H" +#include "Tortuosity.H" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace { + +struct TestStatus { + bool passed = true; + std::string fail_reason; + + void recordFail(const std::string& reason) { + if (passed) { + passed = false; + fail_reason = reason; + } + } +}; + +} // anonymous namespace + + +int main(int argc, char* argv[]) { + amrex::Initialize(argc, argv); + { + TestStatus status; + int verbose = 1; + int domain_size = 16; + int box_size = 8; + + { + amrex::ParmParse pp; + pp.query("verbose", verbose); + pp.query("domain_size", domain_size); + pp.query("box_size", box_size); + } + + const int N = domain_size; + + // Setup geometry and grid + amrex::Box domain_box(amrex::IntVect(0, 0, 0), amrex::IntVect(N - 1, N - 1, N - 1)); + amrex::RealBox rb({AMREX_D_DECL(0.0, 0.0, 0.0)}, + {AMREX_D_DECL(amrex::Real(N), amrex::Real(N), amrex::Real(N))}); + amrex::Array is_periodic{AMREX_D_DECL(0, 0, 0)}; + amrex::Geometry geom; + geom.define(domain_box, &rb, 0, is_periodic.data()); + + amrex::BoxArray ba(domain_box); + ba.maxSize(box_size); + amrex::DistributionMapping dm(ba); + + // ================================================================ + // Test 1: Full flood on uniform domain + // ================================================================ + if (status.passed) { + amrex::iMultiFab mf_phase(ba, dm, 1, 1); + mf_phase.setVal(0); // all phase 0 + mf_phase.FillBoundary(geom.periodicity()); + + amrex::iMultiFab mask(ba, dm, 1, 1); + mask.setVal(OpenImpala::FLOOD_INACTIVE); + + amrex::Vector seeds = {amrex::IntVect(0, 0, 0)}; + OpenImpala::parallelFloodFill(mask, mf_phase, 0, seeds, geom, verbose); + + // Count reachable cells + long long reached = 0; + for (amrex::MFIter mfi(mask); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.validbox(); + const auto arr = mask.const_array(mfi); + 
amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + if (arr(i, j, k, 0) == OpenImpala::FLOOD_ACTIVE) { + reached++; + } + }); + } + amrex::ParallelAllReduce::Sum(reached, amrex::ParallelContext::CommunicatorSub()); + + long long expected = static_cast(N) * N * N; + if (reached != expected) { + status.recordFail("Test 1 (full flood): reached=" + std::to_string(reached) + + ", expected=" + std::to_string(expected)); + } + if (status.passed && verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Test 1 (full flood): PASS (" << reached << "/" << expected + << " cells)\n"; + } + } + + // ================================================================ + // Test 2: Partial flood — two isolated blocks + // ================================================================ + if (status.passed) { + amrex::iMultiFab mf_phase(ba, dm, 1, 1); + mf_phase.setVal(1); // background = phase 1 + + int cube_size = 3; +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for (amrex::MFIter mfi(mf_phase, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + auto arr = mf_phase.array(mfi); + amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + // Block A at corner (1,1,1) + if (i >= 1 && i < 1 + cube_size && j >= 1 && j < 1 + cube_size && k >= 1 && + k < 1 + cube_size) { + arr(i, j, k, 0) = 0; + } + // Block B at far corner, separated by gap + int lo2 = N - 1 - cube_size; + if (i >= lo2 && i < lo2 + cube_size && j >= lo2 && j < lo2 + cube_size && + k >= lo2 && k < lo2 + cube_size) { + arr(i, j, k, 0) = 0; + } + }); + } + mf_phase.FillBoundary(geom.periodicity()); + + // Seed only in block A + amrex::iMultiFab mask(ba, dm, 1, 1); + mask.setVal(OpenImpala::FLOOD_INACTIVE); + + amrex::Vector seeds = {amrex::IntVect(1, 1, 1)}; + OpenImpala::parallelFloodFill(mask, mf_phase, 0, seeds, geom, verbose); + + long long reached = 0; + for (amrex::MFIter mfi(mask); mfi.isValid(); ++mfi) { + const 
amrex::Box& bx = mfi.validbox(); + const auto arr = mask.const_array(mfi); + amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + if (arr(i, j, k, 0) == OpenImpala::FLOOD_ACTIVE) { + reached++; + } + }); + } + amrex::ParallelAllReduce::Sum(reached, amrex::ParallelContext::CommunicatorSub()); + + long long expected_block_vol = + static_cast(cube_size) * cube_size * cube_size; + if (reached != expected_block_vol) { + status.recordFail("Test 2 (partial flood): reached=" + std::to_string(reached) + + ", expected=" + std::to_string(expected_block_vol)); + } + if (status.passed && verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Test 2 (partial flood): PASS (" << reached << " cells, " + << "only block A)\n"; + } + } + + // ================================================================ + // Test 3: collectBoundarySeeds + // ================================================================ + if (status.passed) { + amrex::iMultiFab mf_phase(ba, dm, 1, 1); + mf_phase.setVal(0); // all phase 0 + mf_phase.FillBoundary(geom.periodicity()); + + amrex::Vector inletSeeds, outletSeeds; + OpenImpala::collectBoundarySeeds(mf_phase, 0, 0 /* X direction */, geom, inletSeeds, + outletSeeds); + + // For X direction: inlet = i=0 face, outlet = i=N-1 face + // Each face has N*N cells + long long expected_seeds = static_cast(N) * N; + if (static_cast(inletSeeds.size()) != expected_seeds) { + status.recordFail( + "Test 3 (boundary seeds): inlet seeds=" + std::to_string(inletSeeds.size()) + + ", expected=" + std::to_string(expected_seeds)); + } + if (static_cast(outletSeeds.size()) != expected_seeds) { + status.recordFail( + "Test 3 (boundary seeds): outlet seeds=" + std::to_string(outletSeeds.size()) + + ", expected=" + std::to_string(expected_seeds)); + } + + // Verify all inlet seeds have i=0 + for (const auto& s : inletSeeds) { + if (s[0] != 0) { + status.recordFail("Test 3 (boundary seeds): inlet seed with i=" + + std::to_string(s[0]) + " (expected 0)"); + 
break; + } + } + // Verify all outlet seeds have i=N-1 + for (const auto& s : outletSeeds) { + if (s[0] != N - 1) { + status.recordFail( + "Test 3 (boundary seeds): outlet seed with i=" + std::to_string(s[0]) + + " (expected " + std::to_string(N - 1) + ")"); + break; + } + } + + if (status.passed && verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Test 3 (boundary seeds): PASS (inlet=" << inletSeeds.size() + << ", outlet=" << outletSeeds.size() << ")\n"; + } + } + + // ================================================================ + // Test 4: Multi-label flood (two labels on same mask) + // ================================================================ + if (status.passed) { + amrex::iMultiFab mf_phase(ba, dm, 1, 1); + mf_phase.setVal(1); // background + + int cube_size = 3; +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for (amrex::MFIter mfi(mf_phase, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + auto arr = mf_phase.array(mfi); + amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + if (i >= 1 && i < 1 + cube_size && j >= 1 && j < 1 + cube_size && k >= 1 && + k < 1 + cube_size) { + arr(i, j, k, 0) = 0; + } + int lo2 = N - 1 - cube_size; + if (i >= lo2 && i < lo2 + cube_size && j >= lo2 && j < lo2 + cube_size && + k >= lo2 && k < lo2 + cube_size) { + arr(i, j, k, 0) = 0; + } + }); + } + mf_phase.FillBoundary(geom.periodicity()); + + amrex::iMultiFab mask(ba, dm, 1, 1); + mask.setVal(OpenImpala::FLOOD_INACTIVE); + + // Flood block A with label=1 + amrex::Vector seeds_a = {amrex::IntVect(1, 1, 1)}; + OpenImpala::parallelFloodFill(mask, mf_phase, 0, seeds_a, geom, verbose, 1); + + // Flood block B with label=2 + int lo2 = N - 1 - cube_size; + amrex::Vector seeds_b = {amrex::IntVect(lo2, lo2, lo2)}; + OpenImpala::parallelFloodFill(mask, mf_phase, 0, seeds_b, geom, verbose, 2); + + long long count_1 = 0, count_2 = 0; + for (amrex::MFIter 
mfi(mask); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.validbox(); + const auto arr = mask.const_array(mfi); + amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + if (arr(i, j, k, 0) == 1) + count_1++; + if (arr(i, j, k, 0) == 2) + count_2++; + }); + } + amrex::ParallelAllReduce::Sum(count_1, amrex::ParallelContext::CommunicatorSub()); + amrex::ParallelAllReduce::Sum(count_2, amrex::ParallelContext::CommunicatorSub()); + + long long expected_vol = static_cast(cube_size) * cube_size * cube_size; + if (count_1 != expected_vol) { + status.recordFail("Test 4 (multi-label): label 1 count=" + std::to_string(count_1) + + ", expected=" + std::to_string(expected_vol)); + } + if (count_2 != expected_vol) { + status.recordFail("Test 4 (multi-label): label 2 count=" + std::to_string(count_2) + + ", expected=" + std::to_string(expected_vol)); + } + + if (status.passed && verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Test 4 (multi-label): PASS (label1=" << count_1 + << ", label2=" << count_2 << ")\n"; + } + } + + // ================================================================ + // Final summary + // ================================================================ + if (amrex::ParallelDescriptor::IOProcessor()) { + if (status.passed) { + amrex::Print() << "\n--- TEST RESULT: PASS ---\n"; + } else { + amrex::Print() << "\n--- TEST RESULT: FAIL ---\n"; + amrex::Print() << " Reason: " << status.fail_reason << "\n"; + } + } + + if (!status.passed) { + amrex::Abort("tFloodFill Test FAILED."); + } + } + amrex::Finalize(); + return 0; +} diff --git a/tests/tTortuosityMLMG.cpp b/tests/tTortuosityMLMG.cpp new file mode 100644 index 0000000..63403bf --- /dev/null +++ b/tests/tTortuosityMLMG.cpp @@ -0,0 +1,219 @@ +// tests/tTortuosityMLMG.cpp +// +// Validates the TortuosityMLMG matrix-free solver against known analytical +// solutions. Uses the same synthetic geometry approach as tMultiPhaseTransport. 
+// +// Test cases (selected via inputs): +// uniform: All cells = phase 0, tau = (N-1)/N +// twophase: Alternating layers with equal D, tau = (N-1)/N + +#include "TortuosityMLMG.H" +#include "Tortuosity.H" +#include "SolverConfig.H" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +int main(int argc, char* argv[]) { + amrex::Initialize(argc, argv); + { + amrex::Real strt_time = amrex::second(); + bool test_passed = true; + std::string fail_reason; + + // --- Configuration via ParmParse --- + int domain_size = 32; + int box_size = 16; + int verbose = 1; + int num_phases_fill = 1; + std::string direction_str = "X"; + amrex::Real expected_tau = 1.0; + amrex::Real tau_tolerance = 1e-3; + std::string resultsdir = "./tTortuosityMLMG_results"; + + { + amrex::ParmParse pp; + pp.query("domain_size", domain_size); + pp.query("box_size", box_size); + pp.query("verbose", verbose); + pp.query("num_phases_fill", num_phases_fill); + pp.query("direction", direction_str); + pp.query("expected_tau", expected_tau); + pp.query("tau_tolerance", tau_tolerance); + pp.query("resultsdir", resultsdir); + } + + OpenImpala::Direction direction = OpenImpala::parseDirection(direction_str); + + if (verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << "\n--- TortuosityMLMG Test ---\n"; + amrex::Print() << " Domain Size: " << domain_size << "^3\n"; + amrex::Print() << " Direction: " << direction_str << "\n"; + amrex::Print() << " Expected Tau: " << expected_tau << "\n"; + amrex::Print() << " Tau Tolerance: " << tau_tolerance << "\n"; + amrex::Print() << "-------------------------------\n\n"; + } + + // --- Create synthetic domain --- + amrex::Box domain_box(amrex::IntVect(0, 0, 0), + amrex::IntVect(domain_size - 1, domain_size - 1, domain_size - 1)); + amrex::RealBox rb({AMREX_D_DECL(0.0, 0.0, 0.0)}, + {AMREX_D_DECL(amrex::Real(domain_size), amrex::Real(domain_size), + 
amrex::Real(domain_size))}); + amrex::Array is_periodic{AMREX_D_DECL(0, 0, 0)}; + amrex::Geometry geom; + geom.define(domain_box, &rb, 0, is_periodic.data()); + + amrex::BoxArray ba(domain_box); + ba.maxSize(box_size); + amrex::DistributionMapping dm(ba); + + // --- Create and fill phase field --- + amrex::iMultiFab mf_phase(ba, dm, 1, 1); + + if (num_phases_fill == 1) { + mf_phase.setVal(0); + } else { + // Alternating layers along X +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for (amrex::MFIter mfi(mf_phase, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& bx = mfi.growntilebox(); + amrex::Array4 const phase_arr = mf_phase.array(mfi); + int dir_idx = static_cast(direction); + amrex::LoopOnCpu(bx, [&](int i, int j, int k) { + int coord = (dir_idx == 0) ? i : (dir_idx == 1) ? j : k; + phase_arr(i, j, k, 0) = (coord % 2 == 0) ? 0 : 1; + }); + } + } + mf_phase.FillBoundary(geom.periodicity()); + + amrex::Real vf = 1.0; + + // --- Create results directory --- + if (!resultsdir.empty() && amrex::ParallelDescriptor::IOProcessor()) { + amrex::UtilCreateDirectory(resultsdir, 0755); + } + amrex::ParallelDescriptor::Barrier(); + + // --- Construct and run TortuosityMLMG --- + std::unique_ptr tort; + try { + tort = std::make_unique( + geom, ba, dm, mf_phase, vf, 0 /* phase_id */, direction, resultsdir, 0.0 /* vlo */, + 1.0 /* vhi */, verbose, false /* write_plotfile */); + } catch (const std::exception& e) { + test_passed = false; + fail_reason = "TortuosityMLMG construction failed: " + std::string(e.what()); + } + + // --- Calculate tortuosity --- + amrex::Real actual_tau = std::numeric_limits::quiet_NaN(); + if (test_passed && tort) { + try { + actual_tau = tort->value(); + if (std::isnan(actual_tau) || std::isinf(actual_tau)) { + test_passed = false; + fail_reason = "Tortuosity value is NaN or Inf"; + } + } catch (const std::exception& e) { + test_passed = false; + fail_reason = "Exception during solve: " 
+ std::string(e.what()); + } + } + + // --- Check solver convergence --- + if (test_passed && tort) { + if (!tort->getSolverConverged()) { + test_passed = false; + fail_reason = "MLMG solver did not converge (residual=" + + std::to_string(tort->getFinalRelativeResidualNorm()) + ")"; + } else if (verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Solver convergence: PASS (" << tort->getSolverIterations() + << " iterations, residual=" << tort->getFinalRelativeResidualNorm() + << ")\n"; + } + } + + // --- Validate tortuosity against expected value --- + if (test_passed) { + amrex::Real diff = std::abs(actual_tau - expected_tau); + if (diff > tau_tolerance) { + test_passed = false; + fail_reason = "Tortuosity mismatch. Expected: " + std::to_string(expected_tau) + + ", Got: " + std::to_string(actual_tau) + + ", Diff: " + std::to_string(diff); + } else if (verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Tortuosity value check: PASS (tau=" << actual_tau + << ", expected=" << expected_tau << ", diff=" << diff << ")\n"; + } + } + + // --- Check active volume fraction --- + if (test_passed && tort) { + amrex::Real active_vf = tort->getActiveVolumeFraction(); + if (active_vf < 0.99) { + test_passed = false; + fail_reason = + "Active volume fraction unexpectedly low: " + std::to_string(active_vf); + } else if (verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Active VF check: PASS (active_vf=" << active_vf + << ")\n"; + } + } + + // --- Check plane flux conservation --- + if (test_passed && tort) { + const auto& plane_fluxes = tort->getPlaneFluxes(); + amrex::Real max_dev = tort->getPlaneFluxMaxDeviation(); + constexpr amrex::Real plane_flux_tol = 1.0e-6; + + if (!plane_fluxes.empty() && max_dev > plane_flux_tol) { + test_passed = false; + fail_reason = + "Plane flux conservation failed. 
Max deviation: " + std::to_string(max_dev); + } else if (verbose >= 1 && amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << " Plane flux conservation: PASS (" << plane_fluxes.size() + << " faces, max_dev=" << std::scientific << max_dev + << std::defaultfloat << ")\n"; + } + } + + // --- Final summary --- + amrex::Real stop_time = amrex::second() - strt_time; + amrex::ParallelDescriptor::ReduceRealMax(stop_time, + amrex::ParallelDescriptor::IOProcessorNumber()); + + if (amrex::ParallelDescriptor::IOProcessor()) { + amrex::Print() << "\n Run time = " << stop_time << " sec\n"; + if (test_passed) { + amrex::Print() << "\n--- TEST RESULT: PASS ---\n"; + } else { + amrex::Print() << "\n--- TEST RESULT: FAIL ---\n"; + amrex::Print() << " Reason: " << fail_reason << "\n"; + } + } + + if (!test_passed) { + amrex::Abort("TortuosityMLMG Test FAILED."); + } + } + amrex::Finalize(); + return 0; +} diff --git a/tutorials/01_hello_openimpala.ipynb b/tutorials/01_hello_openimpala.ipynb index 9968cad..38b4403 100644 --- a/tutorials/01_hello_openimpala.ipynb +++ b/tutorials/01_hello_openimpala.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala, PoreSpy for structure generation, and Matplotlib for plots.\n!pip install -q openimpala porespy matplotlib" + "source": "# Install OpenImpala, PoreSpy for structure generation, and Matplotlib for plots.\n!pip install -q openimpala-cuda porespy matplotlib" }, { "cell_type": "code", diff --git a/tutorials/02_digital_twin.ipynb b/tutorials/02_digital_twin.ipynb index 6277621..7446340 100644 --- a/tutorials/02_digital_twin.ipynb +++ b/tutorials/02_digital_twin.ipynb @@ -12,7 +12,7 @@ "outputs": [], "source": [ "# Install OpenImpala, PyBaMM, and visualization utilities\n", - "!pip install -q openimpala pybamm bpx tifffile matplotlib yt" + "!pip install -q openimpala-cuda pybamm bpx tifffile matplotlib yt" ] }, { diff --git a/tutorials/03_rev_and_uncertainty.ipynb 
b/tutorials/03_rev_and_uncertainty.ipynb index 0848851..9d7033c 100644 --- a/tutorials/03_rev_and_uncertainty.ipynb +++ b/tutorials/03_rev_and_uncertainty.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala and utilities\n!pip install -q openimpala tifffile matplotlib scipy" + "source": "# Install OpenImpala and utilities\n!pip install -q openimpala-cuda tifffile matplotlib scipy" }, { "cell_type": "code", diff --git a/tutorials/04_multiphase_and_fields.ipynb b/tutorials/04_multiphase_and_fields.ipynb index 0e4c463..fc1f089 100644 --- a/tutorials/04_multiphase_and_fields.ipynb +++ b/tutorials/04_multiphase_and_fields.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala, PoreSpy, yt (for AMReX plotfile visualisation), and Matplotlib.\n!pip install -q openimpala porespy yt matplotlib" + "source": "# Install OpenImpala, PoreSpy, yt (for AMReX plotfile visualisation), and Matplotlib.\n!pip install -q openimpala-cuda porespy yt matplotlib" }, { "cell_type": "code", diff --git a/tutorials/05_surrogate_modelling.ipynb b/tutorials/05_surrogate_modelling.ipynb index 3505cd7..cf8082f 100644 --- a/tutorials/05_surrogate_modelling.ipynb +++ b/tutorials/05_surrogate_modelling.ipynb @@ -12,7 +12,7 @@ "outputs": [], "source": [ "# Install OpenImpala and ML libraries\n", - "!pip install -q openimpala porespy scikit-learn matplotlib" + "!pip install -q openimpala-cuda porespy scikit-learn matplotlib" ] }, { diff --git a/tutorials/06_topology_optimisation.ipynb b/tutorials/06_topology_optimisation.ipynb index b668c30..31bd69a 100644 --- a/tutorials/06_topology_optimisation.ipynb +++ b/tutorials/06_topology_optimisation.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "!pip install -q openimpala matplotlib" + "source": "!pip install -q openimpala-cuda matplotlib" }, { "cell_type": "code", diff --git 
a/tutorials/07_hpc_scaling.ipynb b/tutorials/07_hpc_scaling.ipynb index 7c3cb77..a10b2ce 100644 --- a/tutorials/07_hpc_scaling.ipynb +++ b/tutorials/07_hpc_scaling.ipynb @@ -242,7 +242,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "!pip install -q openimpala porespy matplotlib" + "source": "!pip install -q openimpala-cuda porespy matplotlib" }, { "cell_type": "code", @@ -266,7 +266,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": "## Summary\n\n| Scenario | Approach |\n|----------|----------|\n| **Laptop / Colab** | `pip install openimpala`, use NumPy arrays directly |\n| **Small cluster (1-16 cores)** | `mpirun -np 16 python script.py` with NumPy loading |\n| **Large cluster (16-128+ cores)** | `mpirun -np 128 python script.py` with `oi.read_image()` for parallel I/O |\n| **HPC without Python** | `mpirun -np 128 Diffusion ./inputs` (pure C++ application) |\n\n### Solver Quick Reference\n\n| Solver | Best For | Notes |\n|--------|----------|-------|\n| **FlexGMRES** | General use | Robust default, handles non-symmetric systems |\n| **PCG** | Symmetric problems | Fastest when applicable |\n| **SMG/PFMG** | Structured grids | Geometric multigrid, excellent on regular domains |\n| **BiCGSTAB** | Non-symmetric | Alternative to GMRES |\n\nThe API is the same at every scale — only the launch command changes. The scaling study and solver comparison in Sections 6-7 provide concrete data to guide your deployment decisions.\n\n**Back to the beginning:**\n- [Tutorial 1: Getting Started](01_hello_openimpala.ipynb) — Core workflow with synthetic microstructures.\n- [Tutorial 2: From 3D Image to Device Model](02_digital_twin.ipynb) — Load real CT scans and export to PyBaMM.\n\n---\n\n## References & Further Reading\n\n1. **OpenImpala:** S. Mayner, F. Ciucci, *OpenImpala: open-source computational framework for microstructural analysis of 3D tomography data*, SoftwareX (2024). [GitHub](https://github.com/BASE-Laboratory/OpenImpala)\n2. 
**AMReX:** W. Zhang et al., *AMReX: a framework for block-structured adaptive mesh refinement*, J. Open Source Software 4(37), 1370 (2019). [doi:10.21105/joss.01370](https://doi.org/10.21105/joss.01370)\n3. **AMReX scaling:** A. S. Almgren et al., *Block-structured adaptive mesh refinement — theory, implementation and application*, J. Comput. Physics 332, 1-28 (2017). [doi:10.1016/j.jcp.2016.12.073](https://doi.org/10.1016/j.jcp.2016.12.073)\n4. **HYPRE:** R. D. Falgout & U. M. Yang, *hypre: A library of high performance preconditioners*, Computational Science — ICCS 2002, LNCS 2331, pp. 632-641 (2002). [doi:10.1007/3-540-47789-6_66](https://doi.org/10.1007/3-540-47789-6_66)\n5. **Parallel HDF5:** The HDF Group, *HDF5 — Parallel I/O*, [hdfgroup.org](https://www.hdfgroup.org/solutions/hdf5/)\n6. **Apptainer/Singularity for HPC:** G. M. Kurtzer et al., *Singularity: Scientific containers for mobility of compute*, PLoS ONE 12(5), e0177459 (2017). [doi:10.1371/journal.pone.0177459](https://doi.org/10.1371/journal.pone.0177459)\n7. **MPI standard:** Message Passing Interface Forum, *MPI: A Message-Passing Interface Standard, Version 4.0* (2021). 
[mpi-forum.org](https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf)" + "source": "## Summary\n\n| Scenario | Approach |\n|----------|----------|\n| **Laptop / Colab** | `pip install openimpala-cuda`, use NumPy arrays directly |\n| **Small cluster (1-16 cores)** | `mpirun -np 16 python script.py` with NumPy loading |\n| **Large cluster (16-128+ cores)** | `mpirun -np 128 python script.py` with `oi.read_image()` for parallel I/O |\n| **HPC without Python** | `mpirun -np 128 Diffusion ./inputs` (pure C++ application) |\n\n### Solver Quick Reference\n\n| Solver | Best For | Notes |\n|--------|----------|-------|\n| **FlexGMRES** | General use | Robust default, handles non-symmetric systems |\n| **PCG** | Symmetric problems | Fastest when applicable |\n| **SMG/PFMG** | Structured grids | Geometric multigrid, excellent on regular domains |\n| **BiCGSTAB** | Non-symmetric | Alternative to GMRES |\n\nThe API is the same at every scale — only the launch command changes. The scaling study and solver comparison in Sections 6-7 provide concrete data to guide your deployment decisions.\n\n**Back to the beginning:**\n- [Tutorial 1: Getting Started](01_hello_openimpala.ipynb) — Core workflow with synthetic microstructures.\n- [Tutorial 2: From 3D Image to Device Model](02_digital_twin.ipynb) — Load real CT scans and export to PyBaMM.\n\n---\n\n## References & Further Reading\n\n1. **OpenImpala:** S. Mayner, F. Ciucci, *OpenImpala: open-source computational framework for microstructural analysis of 3D tomography data*, SoftwareX (2024). [GitHub](https://github.com/BASE-Laboratory/OpenImpala)\n2. **AMReX:** W. Zhang et al., *AMReX: a framework for block-structured adaptive mesh refinement*, J. Open Source Software 4(37), 1370 (2019). [doi:10.21105/joss.01370](https://doi.org/10.21105/joss.01370)\n3. **AMReX scaling:** A. S. Almgren et al., *Block-structured adaptive mesh refinement — theory, implementation and application*, J. Comput. Physics 332, 1-28 (2017). 
[doi:10.1016/j.jcp.2016.12.073](https://doi.org/10.1016/j.jcp.2016.12.073)\n4. **HYPRE:** R. D. Falgout & U. M. Yang, *hypre: A library of high performance preconditioners*, Computational Science — ICCS 2002, LNCS 2331, pp. 632-641 (2002). [doi:10.1007/3-540-47789-6_66](https://doi.org/10.1007/3-540-47789-6_66)\n5. **Parallel HDF5:** The HDF Group, *HDF5 — Parallel I/O*, [hdfgroup.org](https://www.hdfgroup.org/solutions/hdf5/)\n6. **Apptainer/Singularity for HPC:** G. M. Kurtzer et al., *Singularity: Scientific containers for mobility of compute*, PLoS ONE 12(5), e0177459 (2017). [doi:10.1371/journal.pone.0177459](https://doi.org/10.1371/journal.pone.0177459)\n7. **MPI standard:** Message Passing Interface Forum, *MPI: A Message-Passing Interface Standard, Version 4.0* (2021). [mpi-forum.org](https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf)" } ], "metadata": {