From 69479dcdbcda7c2110d5fbf2f3521a0805f4b4e2 Mon Sep 17 00:00:00 2001 From: James Le Houx Date: Sun, 29 Mar 2026 14:10:23 +0000 Subject: [PATCH 1/6] Add automated semantic versioning and release drafting (#85) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Release Drafter config with PR label → changelog category mapping and semantic version resolution (major/minor/patch from labels) - Add Release Drafter workflow triggered on push to master and PR events - Auto-label PRs by file path (physics, io, docs, devops, gpu, python) - Sync CMake project VERSION with Git tags instead of hardcoded 0.1.0 - Add fetch-depth: 0 to release.yml checkout for tag visibility - Update release.yml to preserve Release Drafter notes when attaching SIF https://claude.ai/code/session_01RKnn97qiD7sbCeABHH3eQk --- .github/release-drafter.yml | 133 ++++++++++++++++++++++++++ .github/workflows/release-drafter.yml | 21 ++++ .github/workflows/release.yml | 8 +- CMakeLists.txt | 31 +++++- 4 files changed, 191 insertions(+), 2 deletions(-) create mode 100644 .github/release-drafter.yml create mode 100644 .github/workflows/release-drafter.yml diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml new file mode 100644 index 0000000..acd1f3c --- /dev/null +++ b/.github/release-drafter.yml @@ -0,0 +1,133 @@ +# Configuration for Release Drafter +# https://github.com/release-drafter/release-drafter +# +# Automatically drafts GitHub release notes from merged PRs. +# PRs are categorized by their labels into changelog sections. 
+ +name-template: 'v$RESOLVED_VERSION' +tag-template: 'v$RESOLVED_VERSION' + +# Determine the next version bump from PR labels +version-resolver: + major: + labels: + - 'breaking' + minor: + labels: + - 'enhancement' + - 'feature' + - 'physics' + patch: + labels: + - 'bug' + - 'fix' + - 'performance' + - 'documentation' + - 'devops' + - 'dependencies' + default: patch + +# Map PR labels to changelog sections +categories: + - title: '🔬 Physics & Solvers' + labels: + - 'physics' + - 'solver' + - title: '🚀 New Features' + labels: + - 'feature' + - 'enhancement' + - title: '⚡ Performance' + labels: + - 'performance' + - 'gpu' + - title: '🐛 Bug Fixes' + labels: + - 'bug' + - 'fix' + - title: '📖 Documentation' + labels: + - 'documentation' + - title: '🔧 DevOps & CI' + labels: + - 'devops' + - 'ci' + - title: '📦 Dependencies' + labels: + - 'dependencies' + - title: '⚠️ Breaking Changes' + labels: + - 'breaking' + - title: '🧹 Maintenance' + labels: + - 'maintenance' + - 'refactor' + +# Exclude PRs with these labels from release notes +exclude-labels: + - 'skip-changelog' + +# Template for the release body +template: | + ## What's Changed + + $CHANGES + + **Full Changelog**: https://github.com/$OWNER/$REPOSITORY/compare/$PREVIOUS_TAG...v$RESOLVED_VERSION + +# Auto-label PRs based on file paths +autolabeler: + - label: 'physics' + files: + - 'src/props/**' + title: + - '/tortuosity/i' + - '/diffusiv/i' + - '/solver/i' + - '/HYPRE/i' + - '/MLMG/i' + - label: 'io' + files: + - 'src/io/**' + title: + - '/reader/i' + - '/TIFF/i' + - '/HDF5/i' + - label: 'documentation' + files: + - 'docs/**' + - '*.md' + - 'Doxyfile' + title: + - '/docs/i' + - '/documentation/i' + - label: 'devops' + files: + - '.github/**' + - 'containers/**' + - 'pyproject.toml' + - 'CMakeLists.txt' + title: + - '/CI/i' + - '/workflow/i' + - '/wheel/i' + - '/PyPI/i' + - label: 'gpu' + files: + - 'src/props/*GPU*' + - 'src/props/*CUDA*' + title: + - '/CUDA/i' + - '/GPU/i' + - '/NVCC/i' + - label: 'python' + 
files: + - 'python/**' + title: + - '/python/i' + - '/pybind/i' + - '/binding/i' + - label: 'tests' + files: + - 'tests/**' + - 'python/tests/**' diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml new file mode 100644 index 0000000..9d6097b --- /dev/null +++ b/.github/workflows/release-drafter.yml @@ -0,0 +1,21 @@ +# .github/workflows/release-drafter.yml +name: Release Drafter + +on: + push: + branches: + - master + pull_request_target: + types: [opened, reopened, synchronize] + +permissions: + contents: write + pull-requests: write + +jobs: + update-release-draft: + runs-on: ubuntu-latest + steps: + - uses: release-drafter/release-drafter@v6 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0c5db22..44e4f6c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,6 +15,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Set up Apptainer uses: eWaterCycle/setup-apptainer@v2 @@ -121,10 +123,14 @@ jobs: sudo apptainer build "$SIF_FILENAME" Singularity.final.def echo "SIF_FILENAME=$SIF_FILENAME" >> $GITHUB_ENV - - name: Create GitHub Release and Upload SIF + - name: Upload SIF to GitHub Release uses: softprops/action-gh-release@v2 with: files: ${{ env.SIF_FILENAME }} + # Release notes are pre-populated by Release Drafter. + # Only fall back to auto-generated notes if the body is empty + # (e.g. tag was pushed without a prior draft). generate_release_notes: true + append_body: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 16aca61..a0e2931 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,8 +15,37 @@ cmake_minimum_required(VERSION 3.18) +# --------------------------------------------------------------------------- +# Derive project version from the latest Git tag (e.g. v4.0.1 → 4.0.1). 
+# Falls back to OPENIMPALA_FALLBACK_VERSION when building outside a Git +# repository or when no tag is reachable (e.g. shallow clone without --tags). +# --------------------------------------------------------------------------- +set(OPENIMPALA_FALLBACK_VERSION "4.0.1") + +find_package(Git QUIET) +if(GIT_FOUND) + execute_process( + COMMAND "${GIT_EXECUTABLE}" describe --tags --match "v[0-9]*" --abbrev=0 + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE _git_tag + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + RESULT_VARIABLE _git_result + ) + if(_git_result EQUAL 0 AND _git_tag MATCHES "^v([0-9]+\\.[0-9]+\\.[0-9]+)") + set(_detected_version "${CMAKE_MATCH_1}") + endif() +endif() + +if(NOT _detected_version) + set(_detected_version "${OPENIMPALA_FALLBACK_VERSION}") + message(STATUS "Git tag not found — using fallback version ${_detected_version}") +else() + message(STATUS "Version from Git tag: ${_detected_version}") +endif() + +project(OpenImpala - VERSION 0.1.0 + VERSION ${_detected_version} LANGUAGES C CXX Fortran DESCRIPTION "Image-based simulation of transport properties in porous media" ) From be593adc18081af74574b708d8d53aba9a27b012 Mon Sep 17 00:00:00 2001 From: James Le Houx <37665786+jameslehoux@users.noreply.github.com> Date: Sun, 29 Mar 2026 15:36:38 +0100 Subject: [PATCH 2/6] Create initial draft for OpenImpala paper Add initial draft of OpenImpala paper detailing its framework and advancements. 
--- paper.md | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 paper.md diff --git a/paper.md b/paper.md new file mode 100644 index 0000000..1aa428e --- /dev/null +++ b/paper.md @@ -0,0 +1,64 @@ +--- +title: 'OpenImpala: A High-Performance, Image-Based Microstructural Parameterisation Engine for Porous Media' +tags: + - C++ + - Python + - high-performance computing + - porous media + - battery modelling + - AMReX + - tortuosity +authors: + - name: James Le Houx + orcid: 0000-0002-1576-0673 + corresponding: true + email: james.le-houx@stfc.ac.uk + affiliation: "1, 2, 3" + - name: Denis Kramer + orcid: 0000-0003-0605-1047 + affiliation: 4 +affiliations: + - name: University of Greenwich, Old Royal Naval College, Park Row, London, SE10 9LS, United Kingdom + index: 1 + - name: ISIS Neutron & Muon Source, Rutherford Appleton Laboratory, Didcot, OX11 0QX, United Kingdom + index: 2 + - name: The Faraday Institution, Harwell Science and Innovation Campus, Didcot, OX11 0RA, United Kingdom + index: 3 + - name: Helmut Schmidt University, Hamburg, Germany + index: 4 +date: 29 March 2026 +bibliography: paper.bib +--- +# Summary + +`OpenImpala` is a high-performance computing (HPC) framework built upon the AMReX library, designed for the direct evaluation of effective transport properties—such as tortuosity and effective electrical conductivity—from three-dimensional microstructural imaging data (e.g., X-ray Computed Tomography). By formulating the discrete governing equations directly on Cartesian voxel grids, `OpenImpala` circumvents the computationally prohibitive mesh-generation bottlenecks typically associated with conventional finite element methods (FEM). The framework couples a highly scalable, distributed-memory C++ computational backend with a flexible Python interface, facilitating the automated parameterisation of complex porous media for downstream systems-level continuum modelling. 
+ +# Statement of Need + +High-resolution three-dimensional microstructural imaging is increasingly utilized across battery research, geosciences, and materials engineering to characterize internal transport phenomena. However, the extraction of bulk effective physical parameters from billion-voxel datasets represents a significant computational challenge. Conventional finite element tools often encounter tractability limits during the mesh-generation phase when applied to complex, highly tortuous microstructures. Conversely, existing open-source voxel-based solvers, while highly accessible, generally lack the distributed-memory Message Passing Interface (MPI) scalability required to process massive, out-of-core datasets on HPC architectures. + +`OpenImpala` addresses this methodological gap by providing a natively parallelised (MPI, OpenMP, and CUDA) matrix-free AMReX backend capable of scaling across thousands of compute cores. Through its modern Python application programming interface (API), `OpenImpala` serves as a robust upstream microstructural parameterisation engine. It is capable of ingesting raw or segmented tomographic data and seamlessly exporting computed effective macroscopic properties to downstream continuum models, such as the widely adopted PyBaMM battery modelling framework. + +# Major Advancements Since Initial Publication + +`OpenImpala` was originally introduced as a command-line hybrid C++ and Fortran framework optimized for solving Laplace's equation to extract tortuosity. Since its initial publication in *SoftwareX*, the framework has undergone a complete architectural transformation. This overhaul was motivated by three primary objectives: achieving exascale readiness on heterogeneous hardware, enabling seamless interoperability with the broader scientific Python ecosystem, and ensuring long-term software sustainability. 
+ +To address the computational demands of increasingly massive tomographic datasets, the core architecture was heavily modernised. The legacy Fortran compute kernels were entirely deprecated in favor of a pure C++ infrastructure utilizing native AMReX Lambdas. This shift facilitated the implementation of native GPU acceleration via CUDA, allowing the solver to fully leverage modern heterogeneous supercomputing architectures. Furthermore, to optimize memory utilization during massive out-of-core computations, the monolithic diffusion driver was deconstructed. The framework now utilizes intelligent solver routing driven by AMReX’s Matrix-Free Multi-Level Multi-Grid (MLMG) infrastructure, all managed by a modernised, modular CMake build system. + +Beyond raw computational performance, the framework was redesigned to function as an accessible, upstream parameterisation engine for modern continuum modelling workflows. Recognizing that the battery and geoscience modelling communities operate predominantly within Python, `OpenImpala` was transitioned from a standalone compiled tool to a fully importable Python library (`import openimpala`) via `pybind11`. Distribution was modernised using `pyproject.toml` and `cibuildwheel` to enable standard package manager installations (e.g., `pip`). This transition to a community-accessible library is supported by a comprehensive Sphinx/ReadTheDocs infrastructure, featuring extensive tutorial series and Doxygen-generated API references. + +Finally, the physical fidelity and scientific rigor of the framework were significantly expanded. `OpenImpala` now supports complex multi-phase computations, allowing users to map specific phase identifiers to unique transport coefficients, alongside an integrated microstructural parameterisation engine capable of extracting macroscopic metrics such as Specific Surface Area (SSA), Representative Elementary Volumes (REV), and Pore Size Distributions (PSD). 
To guarantee the mathematical correctness of these new physics modules, rigorous software engineering practices were adopted, including automated CI/CD GitHub Actions pipelines, integrated Catch2 testing, synthetic analytical benchmarking, and comprehensive input validation to prevent silent numerical failures in complex HPC environments. + +* **Automated CI/CD:** Deployed rigorous GitHub Actions pipelines for automated code formatting (`clang-format`), static analysis (`clang-tidy`), and test coverage reporting via Codecov. + +# Future Directions + +Active development is focused on deepening integration with the broader scientific Python ecosystem. This includes the implementation of a direct memory-coupling API for PyBaMM, enabling researchers to perform 3D microstructural parameterisation and 1D electrochemical simulation in a single, zero-copy Python script. + +# Acknowledgements + +This work was financially supported by the EPSRC Centre for Doctoral Training (CDT) in Energy Storage and its Applications [grant ref: EP/R021295/1]; the Ada Lovelace Centre (ALC) STFC project, CANVAS-NXtomo; the EPSRC prosperity partnership with Imperial College, INFUSE [grant ref: EP/V038044/1]; the Rutherford Appleton Laboratory; The Faraday Institution through James Le Houx's Emerging Leader Fellowship [Grant No. FIELF001]; and Research England’s ‘Expanding Excellence in England’ grant at the University of Greenwich via the “Multi-scale Multi-disciplinary Modelling for Impact” (M34Impact) programme. + +The authors acknowledge the use of the IRIDIS High Performance Computing Facility, Diamond Light Source's Wilson HPC cluster, STFC's SCARF cluster, and the University of Greenwich's M34Impact HPC Cluster. We also thank the developers of AMReX, HYPRE, libtiff, and HDF5, upon which OpenImpala relies. 
+ +# References From e889e251b8ca00cfdf25fc4ec6966c38f1be173a Mon Sep 17 00:00:00 2001 From: James Le Houx <37665786+jameslehoux@users.noreply.github.com> Date: Sun, 29 Mar 2026 15:39:20 +0100 Subject: [PATCH 3/6] Add bibliography entries for various research articles Added multiple references to the bibliography file including articles on OpenImpala, statistical effective diffusivity estimation, AMReX framework, and Python Battery Mathematical Modelling. --- paper.bib | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 paper.bib diff --git a/paper.bib b/paper.bib new file mode 100644 index 0000000..999e855 --- /dev/null +++ b/paper.bib @@ -0,0 +1,73 @@ +@article{LeHoux2021OpenImpala, + title = {{{OpenImpala}}: {{OPEN}} source {{IMage}} based {{PArallisable}} {{Linear}} {{Algebra}} solver}, + author = {Le Houx, James and Kramer, Denis}, + year = {2021}, + journal = {SoftwareX}, + volume = {15}, + pages = {100729}, + doi = {10.1016/j.softx.2021.100729}, + issn = {2352-7110} +} + +@article{le2023statistical, + title={Statistical Effective Diffusivity Estimation in Porous Media Using an Integrated On-site Imaging Workflow for Synchrotron Users}, + author={Le Houx, James and Ruiz, Siul and McKay Fletcher, Daniel and Ahmed, Sharif and Roose, Tiina}, + journal={Transport in Porous Media}, + volume={150}, + number={1}, + pages={71--88}, + year={2023}, + publisher={Springer}, + doi={10.1007/s11242-023-01993-7} +} + +@article{amrex2019, + title={AMReX: a framework for block-structured adaptive mesh refinement}, + author={Zhang, Weiqun and Almgren, Ann and Beckner, Vince and Bell, John and Blaschke, Johannes and Chan, Cy and Day, Marcus and Friesen, Brian and Gott, Kevin and Graves, Daniel and others}, + journal={Journal of Open Source Software}, + volume={4}, + number={37}, + pages={1370}, + year={2019}, + doi={10.21105/joss.01370} +} + +@article{lu2025immersed, + title={An immersed interface Adaptive Mesh 
Refinement algorithm for Li-ion battery simulations. II. Multi-dimensional extension and separator modeling}, + author={Lu, Jiawei and Nikiforakis, Nikolaos and Gokhale, Nandan}, + journal={Journal of Applied Physics}, + volume={138}, + number={4}, + pages={045002}, + year={2025}, + publisher={AIP Publishing}, + doi={10.1063/5.0281626} +} + +@article{sulzer2021python, + title={{Python Battery Mathematical Modelling (PyBaMM)}}, + author={Sulzer, Valentin and Marquis, Scott G and Timms, Robert and Robinson, Martin and Chapman, S Jon}, + journal={Journal of Open Research Software}, + volume={9}, + number={1}, + year={2021}, + publisher={Ubiquity Press}, + doi={10.5334/jors.309} +} + +@inproceedings{falgout2002hypre, + title={hypre: A library of high performance preconditioners}, + author={Falgout, Robert D and Yang, Ulrike Meier}, + booktitle={International Conference on Computational Science}, + pages={632--641}, + year={2002}, + organization={Springer}, + doi={10.1007/3-540-47789-6_66} +} + +@misc{jakob2017pybind11, + author = {Wenzel Jakob and Jason Rhinelander and Dean Moldovan}, + year = {2017}, + note = {https://github.com/pybind/pybind11}, + title = {pybind11 -- Seamless operability between C++11 and Python} +} From d756f42d13fae4f1527fb898073465ef7179ae79 Mon Sep 17 00:00:00 2001 From: James Le Houx <37665786+jameslehoux@users.noreply.github.com> Date: Sun, 29 Mar 2026 15:42:29 +0100 Subject: [PATCH 4/6] Revise software architecture and capabilities section Updated the software architecture section to reflect recent changes and improvements in the OpenImpala framework, including its transition to a Python library and enhancements in computational capabilities. 
--- paper.md | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/paper.md b/paper.md index 1aa428e..5df46ce 100644 --- a/paper.md +++ b/paper.md @@ -39,17 +39,13 @@ High-resolution three-dimensional microstructural imaging is increasingly utiliz `OpenImpala` addresses this methodological gap by providing a natively parallelised (MPI, OpenMP, and CUDA) matrix-free AMReX backend capable of scaling across thousands of compute cores. Through its modern Python application programming interface (API), `OpenImpala` serves as a robust upstream microstructural parameterisation engine. It is capable of ingesting raw or segmented tomographic data and seamlessly exporting computed effective macroscopic properties to downstream continuum models, such as the widely adopted PyBaMM battery modelling framework. -# Major Advancements Since Initial Publication +# Software Architecture and Capabilities -`OpenImpala` was originally introduced as a command-line hybrid C++ and Fortran framework optimized for solving Laplace's equation to extract tortuosity. Since its initial publication in *SoftwareX*, the framework has undergone a complete architectural transformation. This overhaul was motivated by three primary objectives: achieving exascale readiness on heterogeneous hardware, enabling seamless interoperability with the broader scientific Python ecosystem, and ensuring long-term software sustainability. +`OpenImpala` is engineered to achieve exascale readiness on heterogeneous hardware, enable seamless interoperability with the broader scientific Python ecosystem, and ensure long-term software sustainability. To address the computational demands of increasingly massive tomographic datasets, the core architecture is built upon a pure C++ infrastructure utilizing native AMReX Lambdas. This design facilitates native GPU acceleration via CUDA, allowing the solver to fully leverage modern heterogeneous supercomputing architectures. 
Furthermore, to optimize memory utilization during massive out-of-core computations, the framework utilizes intelligent solver routing driven by AMReX’s Matrix-Free Multi-Level Multi-Grid (MLMG) infrastructure, all managed by a modern, modular CMake build system. -To address the computational demands of increasingly massive tomographic datasets, the core architecture was heavily modernised. The legacy Fortran compute kernels were entirely deprecated in favor of a pure C++ infrastructure utilizing native AMReX Lambdas. This shift facilitated the implementation of native GPU acceleration via CUDA, allowing the solver to fully leverage modern heterogeneous supercomputing architectures. Furthermore, to optimize memory utilization during massive out-of-core computations, the monolithic diffusion driver was deconstructed. The framework now utilizes intelligent solver routing driven by AMReX’s Matrix-Free Multi-Level Multi-Grid (MLMG) infrastructure, all managed by a modernised, modular CMake build system. +Beyond raw computational performance, the framework functions as an accessible, upstream parameterisation engine for modern continuum modelling workflows. Recognizing that the battery and geoscience modelling communities operate predominantly within Python, `OpenImpala` acts as a fully importable Python library (`import openimpala`) via `pybind11`. Distribution is handled using `pyproject.toml` and `cibuildwheel` to enable standard package manager installations (e.g., `pip`). This community-accessible library is supported by a comprehensive Sphinx and ReadTheDocs infrastructure, featuring extensive tutorial series and Doxygen-generated API references. -Beyond raw computational performance, the framework was redesigned to function as an accessible, upstream parameterisation engine for modern continuum modelling workflows. 
Recognizing that the battery and geoscience modelling communities operate predominantly within Python, `OpenImpala` was transitioned from a standalone compiled tool to a fully importable Python library (`import openimpala`) via `pybind11`. Distribution was modernised using `pyproject.toml` and `cibuildwheel` to enable standard package manager installations (e.g., `pip`). This transition to a community-accessible library is supported by a comprehensive Sphinx/ReadTheDocs infrastructure, featuring extensive tutorial series and Doxygen-generated API references. - -Finally, the physical fidelity and scientific rigor of the framework were significantly expanded. `OpenImpala` now supports complex multi-phase computations, allowing users to map specific phase identifiers to unique transport coefficients, alongside an integrated microstructural parameterisation engine capable of extracting macroscopic metrics such as Specific Surface Area (SSA), Representative Elementary Volumes (REV), and Pore Size Distributions (PSD). To guarantee the mathematical correctness of these new physics modules, rigorous software engineering practices were adopted, including automated CI/CD GitHub Actions pipelines, integrated Catch2 testing, synthetic analytical benchmarking, and comprehensive input validation to prevent silent numerical failures in complex HPC environments. - -* **Automated CI/CD:** Deployed rigorous GitHub Actions pipelines for automated code formatting (`clang-format`), static analysis (`clang-tidy`), and test coverage reporting via Codecov. +Finally, the physical fidelity and scientific rigor of the framework are a primary focus. `OpenImpala` supports complex multi-phase computations, allowing users to map specific phase identifiers to unique transport coefficients. 
It also features an integrated microstructural parameterisation engine capable of extracting macroscopic metrics such as Specific Surface Area (SSA), Representative Elementary Volumes (REV), and Pore Size Distributions (PSD). To guarantee the mathematical correctness of these physics modules, rigorous software engineering practices are enforced. This includes comprehensive input validation to prevent silent numerical failures in complex HPC environments, synthetic analytical benchmarking, integrated Catch2 testing, and automated CI/CD GitHub Actions pipelines for code formatting (`clang-format`), static analysis (`clang-tidy`), and test coverage reporting via Codecov. # Future Directions From 395b8ec1947ede49d6d528313a4a762882ec5c7e Mon Sep 17 00:00:00 2001 From: James Le Houx Date: Sun, 29 Mar 2026 20:08:16 +0000 Subject: [PATCH 5/6] Add performance profiling and CI benchmark workflow (#31) - Enhance profiling notebook with AMReX TinyProfiler breakdown (solver setup vs linear solve vs flux computation) and NVIDIA Nsight Systems GPU kernel profiling for Colab T4 runtimes - Add CI benchmark workflow that runs on PRs touching solver code, tests against analytical solutions (uniform block tau=(N-1)/N), and posts timing results as PR comments for regression tracking https://claude.ai/code/session_01RKnn97qiD7sbCeABHH3eQk --- .github/workflows/benchmark.yml | 306 +++++++++++++++++++ notebooks/profiling_and_tuning.ipynb | 429 +++++++++++++++++++++++---- 2 files changed, 680 insertions(+), 55 deletions(-) create mode 100644 .github/workflows/benchmark.yml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..2d348bf --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,306 @@ +# .github/workflows/benchmark.yml +# Performance regression tracking for OpenImpala. +# +# Runs a fixed set of synthetic benchmarks and posts timing results +# as a PR comment or job summary. 
Triggered manually or on PRs that +# touch solver code. + +name: Performance Benchmark + +on: + pull_request: + paths: + - 'src/props/**' + - 'python/bindings/**' + - 'python/openimpala/**' + - 'CMakeLists.txt' + workflow_dispatch: + inputs: + grid_sizes: + description: 'Comma-separated grid sizes to benchmark (e.g. 64,128)' + required: false + default: '64,128' + +concurrency: + group: benchmark-${{ github.ref }} + cancel-in-progress: true + +jobs: + benchmark: + name: Run Benchmarks + runs-on: ubuntu-latest + permissions: + pull-requests: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Apptainer + uses: eWaterCycle/setup-apptainer@v2 + with: + apptainer-version: 1.2.5 + + - name: Restore Dependency SIF Cache + id: cache-restore-sif + uses: actions/cache/restore@v4 + with: + path: dependency_image.sif + key: ${{ runner.os }}-apptainer-sif-${{ hashFiles('containers/Singularity.deps.def') }} + restore-keys: | + ${{ runner.os }}-apptainer-sif- + + - name: Build Dependency SIF (if cache miss) + if: steps.cache-restore-sif.outputs.cache-hit != 'true' + run: | + sudo apptainer build --force dependency_image.sif containers/Singularity.deps.def + + - name: Build OpenImpala with Python bindings + run: | + sudo apptainer exec \ + --writable-tmpfs \ + --bind $PWD:/src \ + ./dependency_image.sif \ + bash -c ' + source /opt/rh/gcc-toolset-11/enable + cd /src + rm -rf cmake_build_bench && mkdir cmake_build_bench && cd cmake_build_bench + + PYBIND11_DIR=$(python3.11 -m pybind11 --cmakedir) + + cmake .. 
\ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER=$(which mpicc) \ + -DCMAKE_CXX_COMPILER=$(which mpicxx) \ + -DPython_EXECUTABLE=/usr/bin/python3.11 \ + -Dpybind11_DIR=${PYBIND11_DIR} \ + -DCMAKE_PREFIX_PATH="/opt/amrex/25.03;/opt/hypre/v2.32.0;/opt/hdf5/1.12.3;/opt/libtiff/4.6.0" \ + -DBUILD_TESTING=OFF \ + -DOPENIMPALA_PYTHON=ON + + make -j$(nproc) + ' + + - name: Run benchmark suite + id: benchmark + run: | + # Determine grid sizes + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + SIZES="${{ github.event.inputs.grid_sizes }}" + else + SIZES="64,128" + fi + + cat > /tmp/bench.py << 'BENCHEOF' + import sys + import os + import json + import time + import numpy as np + + # Add the build directory to Python path + sys.path.insert(0, "/src/cmake_build_bench") + + import openimpala as oi + + sizes = [int(s) for s in os.environ.get("BENCH_SIZES", "64,128").split(",")] + solvers = ["pcg", "flexgmres", "pfmg", "mlmg"] + n_repeats = 3 + results = [] + + with oi.Session(): + for size in sizes: + print(f"\n=== Benchmarking {size}³ ===", flush=True) + + # Generate uniform block (analytical solution: tau = (N-1)/N) + data = np.zeros((size, size, size), dtype=np.int32) + + for solver_name in solvers: + times = [] + res = None + failed = False + + for trial in range(n_repeats): + try: + t0 = time.perf_counter() + res = oi.tortuosity( + data, phase=0, direction="z", + solver=solver_name, max_grid_size=32, verbose=0 + ) + t1 = time.perf_counter() + times.append(t1 - t0) + except Exception as e: + print(f" {solver_name}: FAILED — {e}") + failed = True + break + + if not failed and res is not None: + median_t = float(np.median(times)) + expected_tau = (size - 1) / size + tau_error = abs(res.tortuosity - expected_tau) / expected_tau + + record = { + "size": size, + "solver": solver_name, + "wall_time_s": round(median_t, 4), + "tortuosity": round(res.tortuosity, 6), + "expected_tau": round(expected_tau, 6), + "relative_error": round(tau_error, 8), + "iterations": 
res.iterations, + "converged": res.solver_converged, + } + results.append(record) + print(f" {solver_name:12s} {median_t:.4f}s " + f"τ={res.tortuosity:.6f} " + f"err={tau_error:.2e} " + f"iters={res.iterations}") + else: + results.append({ + "size": size, "solver": solver_name, + "wall_time_s": None, "tortuosity": None, + "expected_tau": (size - 1) / size, + "relative_error": None, "iterations": None, + "converged": False, + }) + + # Write results to JSON + with open("/tmp/benchmark_results.json", "w") as f: + json.dump(results, f, indent=2) + + print(f"\n{len(results)} benchmark results written.") + BENCHEOF + + sudo apptainer exec \ + --writable-tmpfs \ + --bind $PWD:/src \ + ./dependency_image.sif \ + bash -c " + source /opt/rh/gcc-toolset-11/enable + export PYTHONPATH=/src/cmake_build_bench + export BENCH_SIZES=$SIZES + cd /src && python3.11 /tmp/bench.py + " + + # Copy results out for the next step + cp /tmp/benchmark_results.json ./benchmark_results.json 2>/dev/null || true + + - name: Generate benchmark report + id: report + run: | + python3 << 'REPORTEOF' + import json + import os + + with open("benchmark_results.json") as f: + results = json.load(f) + + # Build markdown table + lines = [] + lines.append("## Performance Benchmark Results") + lines.append("") + lines.append("| Size | Solver | Wall Time (s) | Tortuosity | Expected | Rel. 
Error | Iters | Status |") + lines.append("|------|--------|--------------|------------|----------|-----------|-------|--------|") + + any_failed = False + any_regression = False + + for r in results: + if r["converged"]: + err_str = f'{r["relative_error"]:.2e}' + status = "PASS" if r["relative_error"] < 1e-4 else "WARN" + if r["relative_error"] >= 1e-4: + any_regression = True + status = "WARN" + else: + err_str = "N/A" + status = "FAIL" + any_failed = True + + time_str = f'{r["wall_time_s"]:.4f}' if r["wall_time_s"] else "N/A" + tau_str = f'{r["tortuosity"]:.6f}' if r["tortuosity"] else "N/A" + exp_str = f'{r["expected_tau"]:.6f}' + iter_str = str(r["iterations"]) if r["iterations"] else "N/A" + + lines.append(f'| {r["size"]}³ | {r["solver"]} | {time_str} | {tau_str} | {exp_str} | {err_str} | {iter_str} | {status} |') + + lines.append("") + + # Summary + converged = [r for r in results if r["converged"]] + if converged: + fastest = min(converged, key=lambda r: r["wall_time_s"]) + lines.append(f'**Fastest solver:** {fastest["solver"]} at {fastest["size"]}³ ' + f'({fastest["wall_time_s"]:.4f}s)') + if any_failed: + lines.append("**Warning:** Some solvers failed to converge.") + if any_regression: + lines.append("**Warning:** Some results exceed expected error tolerance (>1e-4).") + + lines.append("") + lines.append("*Benchmark: uniform block (analytical τ = (N-1)/N)*") + + report = "\n".join(lines) + + # Write to step summary + with open(os.environ.get("GITHUB_STEP_SUMMARY", "/dev/null"), "a") as f: + f.write(report) + + # Write to file for PR comment + with open("benchmark_report.md", "w") as f: + f.write(report) + + print(report) + REPORTEOF + + - name: Post benchmark results to PR + if: github.event_name == 'pull_request' + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + let report = ''; + try { + report = fs.readFileSync('benchmark_report.md', 'utf8'); + } catch (e) { + report = 'Benchmark report not available.'; + } + + const 
marker = ''; + const body = marker + '\n' + report; + + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + + const existing = comments.find(c => c.body.includes(marker)); + + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body: body, + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: body, + }); + } + + - name: Upload benchmark artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: benchmark-results-${{ github.run_id }} + path: | + benchmark_results.json + benchmark_report.md + retention-days: 90 + if-no-files-found: warn diff --git a/notebooks/profiling_and_tuning.ipynb b/notebooks/profiling_and_tuning.ipynb index 83866cc..b468dbd 100644 --- a/notebooks/profiling_and_tuning.ipynb +++ b/notebooks/profiling_and_tuning.ipynb @@ -19,23 +19,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/main/notebooks/profiling_and_tuning.ipynb)\n", - "\n", - "# OpenImpala Interactive Profiling & Tuning\n", - "\n", - "This notebook profiles solver performance, tunes AMReX grid decomposition, and compares scaling behavior — all within Google Colab's free tier (T4 GPU, ~12 GB RAM).\n", - "\n", - "**Sections:**\n", - "1. Synthetic Dataset Generation\n", - "2. Baseline Validation\n", - "3. Solver Profiling Sweep\n", - "4. Grid Size Sweep (`max_grid_size`)\n", - "5. Dataset Size Scaling\n", - "6. Direction Anisotropy Check\n", - "7. Multi-Porosity Comparison\n", - "8. 
Summary & HPC Recommendations" - ] + "source": "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/main/notebooks/profiling_and_tuning.ipynb)\n\n# OpenImpala Interactive Profiling & Tuning\n\nThis notebook profiles solver performance, tunes AMReX grid decomposition, and compares scaling behavior \u2014 all within Google Colab's free tier (T4 GPU, ~12 GB RAM).\n\n**Sections:**\n1. Synthetic Dataset Generation\n2. Baseline Validation\n3. Solver Profiling Sweep\n4. Grid Size Sweep (`max_grid_size`)\n5. Dataset Size Scaling\n6. Direction Anisotropy Check\n7. Multi-Porosity Comparison\n8. **AMReX TinyProfiler Breakdown** *(solver setup vs solve vs flux)*\n9. **NVIDIA Nsight Systems GPU Profiling** *(kernel-level timeline)*\n10. Summary & HPC Recommendations" }, { "cell_type": "markdown", @@ -61,10 +45,10 @@ " if gpu_name:\n", " print(f\"GPU detected: {gpu_name}\")\n", " else:\n", - " print(\"No GPU detected — running CPU-only.\")\n", + " print(\"No GPU detected \u2014 running CPU-only.\")\n", "except FileNotFoundError:\n", " gpu_name = \"\"\n", - " print(\"nvidia-smi not found — running CPU-only.\")" + " print(\"nvidia-smi not found \u2014 running CPU-only.\")" ] }, { @@ -147,9 +131,9 @@ "data_medium = generate_microstructure(128)\n", "data_large = generate_microstructure(256)\n", "\n", - "for name, data in [(\"small (64³)\", data_small),\n", - " (\"medium (128³)\", data_medium),\n", - " (\"large (256³)\", data_large)]:\n", + "for name, data in [(\"small (64\u00b3)\", data_small),\n", + " (\"medium (128\u00b3)\", data_medium),\n", + " (\"large (256\u00b3)\", data_large)]:\n", " vf = np.sum(data == 0) / data.size\n", " print(f\"{name:16s} shape={str(data.shape):20s} porosity={vf:.3f}\")" ] @@ -164,7 +148,7 @@ "fig, ax = plt.subplots(figsize=(6, 6))\n", "mid_z = data_medium.shape[0] // 2\n", "ax.imshow(data_medium[mid_z, :, :], cmap=\"gray\", origin=\"lower\")\n", - 
"ax.set_title(f\"data_medium — Z-slice {mid_z} (white=solid, black=pore)\")\n", + "ax.set_title(f\"data_medium \u2014 Z-slice {mid_z} (white=solid, black=pore)\")\n", "ax.set_xlabel(\"X\")\n", "ax.set_ylabel(\"Y\")\n", "plt.tight_layout()\n", @@ -175,7 +159,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Section 2: Baseline — Quick Validation\n", + "## Section 2: Baseline \u2014 Quick Validation\n", "\n", "Run a quick sanity check on `data_small` to confirm the solver pipeline works." ] @@ -204,7 +188,7 @@ "source": [ "## Section 3: Solver Profiling Sweep\n", "\n", - "Profile all available HYPRE solvers on `data_medium` (128³). Each solver is run 3 times; we report the median wall time." + "Profile all available HYPRE solvers on `data_medium` (128\u00b3). Each solver is run 3 times; we report the median wall time." ] }, { @@ -231,7 +215,7 @@ " times.append(t1 - t0)\n", " last_result = res\n", " except Exception as e:\n", - " print(f\" {solver_name} trial {trial}: FAILED — {e}\")\n", + " print(f\" {solver_name} trial {trial}: FAILED \u2014 {e}\")\n", " failed = True\n", " break\n", "\n", @@ -251,7 +235,7 @@ " \"converged\": last_result.solver_converged\n", " })\n", " status = \"OK\" if not failed else \"FAILED\"\n", - " print(f\" {solver_name:12s} — {status}\")\n", + " print(f\" {solver_name:12s} \u2014 {status}\")\n", "\n", "df_solvers = pd.DataFrame(solver_records)\n", "df_solvers" @@ -262,7 +246,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Dual-axis bar chart: wall time (bars) + iteration count (dots)\nfig, ax1 = plt.subplots(figsize=(10, 5))\n\ncolors = [\"#2ecc71\" if c else \"#e74c3c\" for c in df_solvers[\"converged\"]]\ny_pos = np.arange(len(df_solvers))\nbars = ax1.barh(y_pos, df_solvers[\"wall_time_s\"], color=colors, alpha=0.85,\n edgecolor=\"white\", linewidth=0.8)\nax1.set_yticks(y_pos)\nax1.set_yticklabels(df_solvers[\"solver\"])\nax1.set_xlabel(\"Wall Time (s)\", color=\"#2c3e50\")\nax1.set_title(\"Solver 
Profiling — 128³ dataset\", fontsize=13, fontweight=\"bold\")\nax1.tick_params(axis=\"x\", colors=\"#2c3e50\")\n\n# Second x-axis for iteration count\nax2 = ax1.twiny()\nax2.plot(df_solvers[\"iterations\"], y_pos, \"D\", color=\"#8e44ad\", markersize=8,\n zorder=5, label=\"Iterations\")\nax2.set_xlabel(\"Iterations\", color=\"#8e44ad\")\nax2.tick_params(axis=\"x\", colors=\"#8e44ad\")\n\n# Convergence legend\nfrom matplotlib.patches import Patch\nlegend_elements = [Patch(facecolor=\"#2ecc71\", label=\"Converged\"),\n Patch(facecolor=\"#e74c3c\", label=\"Failed\"),\n plt.Line2D([0], [0], marker=\"D\", color=\"#8e44ad\", linestyle=\"None\",\n markersize=8, label=\"Iterations\")]\nax1.legend(handles=legend_elements, loc=\"lower right\", framealpha=0.9)\n\nplt.tight_layout()\nplt.show()" + "source": "# Dual-axis bar chart: wall time (bars) + iteration count (dots)\nfig, ax1 = plt.subplots(figsize=(10, 5))\n\ncolors = [\"#2ecc71\" if c else \"#e74c3c\" for c in df_solvers[\"converged\"]]\ny_pos = np.arange(len(df_solvers))\nbars = ax1.barh(y_pos, df_solvers[\"wall_time_s\"], color=colors, alpha=0.85,\n edgecolor=\"white\", linewidth=0.8)\nax1.set_yticks(y_pos)\nax1.set_yticklabels(df_solvers[\"solver\"])\nax1.set_xlabel(\"Wall Time (s)\", color=\"#2c3e50\")\nax1.set_title(\"Solver Profiling \u2014 128\u00b3 dataset\", fontsize=13, fontweight=\"bold\")\nax1.tick_params(axis=\"x\", colors=\"#2c3e50\")\n\n# Second x-axis for iteration count\nax2 = ax1.twiny()\nax2.plot(df_solvers[\"iterations\"], y_pos, \"D\", color=\"#8e44ad\", markersize=8,\n zorder=5, label=\"Iterations\")\nax2.set_xlabel(\"Iterations\", color=\"#8e44ad\")\nax2.tick_params(axis=\"x\", colors=\"#8e44ad\")\n\n# Convergence legend\nfrom matplotlib.patches import Patch\nlegend_elements = [Patch(facecolor=\"#2ecc71\", label=\"Converged\"),\n Patch(facecolor=\"#e74c3c\", label=\"Failed\"),\n plt.Line2D([0], [0], marker=\"D\", color=\"#8e44ad\", linestyle=\"None\",\n markersize=8, 
label=\"Iterations\")]\nax1.legend(handles=legend_elements, loc=\"lower right\", framealpha=0.9)\n\nplt.tight_layout()\nplt.show()" }, { "cell_type": "code", @@ -307,7 +291,7 @@ " \"wall_time_s\": np.median(times),\n", " \"tortuosity\": res.tortuosity\n", " })\n", - " print(f\" max_grid_size={gs:4d} time={np.median(times):.3f}s τ={res.tortuosity:.4f}\")\n", + " print(f\" max_grid_size={gs:4d} time={np.median(times):.3f}s \u03c4={res.tortuosity:.4f}\")\n", "\n", "df_grid = pd.DataFrame(grid_records)\n", "df_grid" @@ -323,14 +307,14 @@ "ax.plot(df_grid[\"max_grid_size\"], df_grid[\"wall_time_s\"], \"o-\", linewidth=2, markersize=8)\n", "ax.set_xlabel(\"max_grid_size\")\n", "ax.set_ylabel(\"Wall Time (s)\")\n", - "ax.set_title(f\"Grid Decomposition Sweep — 128³, solver={best_solver}\")\n", + "ax.set_title(f\"Grid Decomposition Sweep \u2014 128\u00b3, solver={best_solver}\")\n", "\n", "# Annotations\n", - "ax.annotate(\"Smaller boxes →\\nbetter cache locality,\\nmore MPI parallelism\",\n", + "ax.annotate(\"Smaller boxes \u2192\\nbetter cache locality,\\nmore MPI parallelism\",\n", " xy=(grid_sizes[0], df_grid[\"wall_time_s\"].iloc[0]),\n", " xytext=(grid_sizes[1], df_grid[\"wall_time_s\"].max() * 0.9),\n", " arrowprops=dict(arrowstyle=\"->\", color=\"gray\"), fontsize=9, color=\"gray\")\n", - "ax.annotate(\"Larger boxes →\\nbetter GPU saturation,\\nless overhead\",\n", + "ax.annotate(\"Larger boxes \u2192\\nbetter GPU saturation,\\nless overhead\",\n", " xy=(grid_sizes[-1], df_grid[\"wall_time_s\"].iloc[-1]),\n", " xytext=(grid_sizes[-2], df_grid[\"wall_time_s\"].max() * 0.9),\n", " arrowprops=dict(arrowstyle=\"->\", color=\"gray\"), fontsize=9, color=\"gray\")\n", @@ -356,7 +340,7 @@ }, { "cell_type": "code", - "source": "# Solver x Grid Size heatmap — full 2D parameter sweep\nconverged_solvers = df_solvers[df_solvers[\"converged\"]][\"solver\"].tolist()\nheatmap_grid_sizes = [8, 16, 32, 64, 128]\nheatmap_data = np.full((len(converged_solvers), 
len(heatmap_grid_sizes)), np.nan)\n\nfor i, s in enumerate(converged_solvers):\n for j, gs in enumerate(heatmap_grid_sizes):\n try:\n t0 = time.perf_counter()\n oi.tortuosity(data_medium, phase=0, direction=\"z\",\n solver=s, max_grid_size=gs, verbose=0)\n heatmap_data[i, j] = time.perf_counter() - t0\n except Exception:\n heatmap_data[i, j] = np.nan\n print(f\" {s} done\")\n\nfig, ax = plt.subplots(figsize=(9, max(4, len(converged_solvers) * 0.7)))\nim = ax.imshow(heatmap_data, cmap=\"YlOrRd\", aspect=\"auto\")\n\n# Annotate each cell with its value\nfor i in range(heatmap_data.shape[0]):\n for j in range(heatmap_data.shape[1]):\n val = heatmap_data[i, j]\n if not np.isnan(val):\n text_color = \"white\" if val > np.nanmedian(heatmap_data) else \"black\"\n ax.text(j, i, f\"{val:.2f}s\", ha=\"center\", va=\"center\",\n fontsize=9, fontweight=\"bold\", color=text_color)\n\nax.set_xticks(range(len(heatmap_grid_sizes)))\nax.set_xticklabels(heatmap_grid_sizes)\nax.set_yticks(range(len(converged_solvers)))\nax.set_yticklabels(converged_solvers)\nax.set_xlabel(\"max_grid_size\")\nax.set_ylabel(\"Solver\")\nax.set_title(\"Solver x Grid Size — Wall Time (s)\", fontsize=13, fontweight=\"bold\")\n\n# Mark the global minimum\nbest_idx = np.unravel_index(np.nanargmin(heatmap_data), heatmap_data.shape)\nax.add_patch(plt.Rectangle((best_idx[1] - 0.5, best_idx[0] - 0.5), 1, 1,\n fill=False, edgecolor=\"#2ecc71\", linewidth=3, linestyle=\"--\"))\nax.text(best_idx[1], best_idx[0] + 0.35, \"BEST\", ha=\"center\", va=\"top\",\n fontsize=7, color=\"#2ecc71\", fontweight=\"bold\")\n\ncbar = plt.colorbar(im, ax=ax, shrink=0.8)\ncbar.set_label(\"Wall Time (s)\")\nplt.tight_layout()\nplt.show()", + "source": "# Solver x Grid Size heatmap \u2014 full 2D parameter sweep\nconverged_solvers = df_solvers[df_solvers[\"converged\"]][\"solver\"].tolist()\nheatmap_grid_sizes = [8, 16, 32, 64, 128]\nheatmap_data = np.full((len(converged_solvers), len(heatmap_grid_sizes)), np.nan)\n\nfor i, s in 
enumerate(converged_solvers):\n for j, gs in enumerate(heatmap_grid_sizes):\n try:\n t0 = time.perf_counter()\n oi.tortuosity(data_medium, phase=0, direction=\"z\",\n solver=s, max_grid_size=gs, verbose=0)\n heatmap_data[i, j] = time.perf_counter() - t0\n except Exception:\n heatmap_data[i, j] = np.nan\n print(f\" {s} done\")\n\nfig, ax = plt.subplots(figsize=(9, max(4, len(converged_solvers) * 0.7)))\nim = ax.imshow(heatmap_data, cmap=\"YlOrRd\", aspect=\"auto\")\n\n# Annotate each cell with its value\nfor i in range(heatmap_data.shape[0]):\n for j in range(heatmap_data.shape[1]):\n val = heatmap_data[i, j]\n if not np.isnan(val):\n text_color = \"white\" if val > np.nanmedian(heatmap_data) else \"black\"\n ax.text(j, i, f\"{val:.2f}s\", ha=\"center\", va=\"center\",\n fontsize=9, fontweight=\"bold\", color=text_color)\n\nax.set_xticks(range(len(heatmap_grid_sizes)))\nax.set_xticklabels(heatmap_grid_sizes)\nax.set_yticks(range(len(converged_solvers)))\nax.set_yticklabels(converged_solvers)\nax.set_xlabel(\"max_grid_size\")\nax.set_ylabel(\"Solver\")\nax.set_title(\"Solver x Grid Size \u2014 Wall Time (s)\", fontsize=13, fontweight=\"bold\")\n\n# Mark the global minimum\nbest_idx = np.unravel_index(np.nanargmin(heatmap_data), heatmap_data.shape)\nax.add_patch(plt.Rectangle((best_idx[1] - 0.5, best_idx[0] - 0.5), 1, 1,\n fill=False, edgecolor=\"#2ecc71\", linewidth=3, linestyle=\"--\"))\nax.text(best_idx[1], best_idx[0] + 0.35, \"BEST\", ha=\"center\", va=\"top\",\n fontsize=7, color=\"#2ecc71\", fontweight=\"bold\")\n\ncbar = plt.colorbar(im, ax=ax, shrink=0.8)\ncbar.set_label(\"Wall Time (s)\")\nplt.tight_layout()\nplt.show()", "metadata": {}, "execution_count": null, "outputs": [] @@ -376,7 +360,7 @@ "metadata": {}, "outputs": [], "source": [ - "datasets = [(\"64³\", data_small, 64), (\"128³\", data_medium, 128), (\"256³\", data_large, 256)]\n", + "datasets = [(\"64\u00b3\", data_small, 64), (\"128\u00b3\", data_medium, 128), (\"256\u00b3\", data_large, 256)]\n", 
"size_records = []\n", "\n", "for label, data, size in datasets:\n", @@ -392,7 +376,7 @@ " \"label\": label, \"size\": size, \"n_voxels\": size**3,\n", " \"wall_time_s\": median_t, \"tortuosity\": res.tortuosity\n", " })\n", - " print(f\" {label} time={median_t:.3f}s τ={res.tortuosity:.4f}\")\n", + " print(f\" {label} time={median_t:.3f}s \u03c4={res.tortuosity:.4f}\")\n", "\n", "df_size = pd.DataFrame(size_records)\n", "df_size" @@ -403,7 +387,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Log-log plot with filled area under the scaling curve\nlog_n = np.log10(df_size[\"n_voxels\"].values.astype(float))\nlog_t = np.log10(df_size[\"wall_time_s\"].values)\ncoeffs = np.polyfit(log_n, log_t, 1)\nexponent = coeffs[0]\n\n# Dense interpolation for smooth fill\nn_dense = np.logspace(log_n.min(), log_n.max(), 200)\nt_dense = 10 ** np.polyval(coeffs, np.log10(n_dense))\n\nfig, ax = plt.subplots(figsize=(9, 5))\n\n# Filled area under measured curve (gradient effect via stacked fills)\nfrom matplotlib.colors import LinearSegmentedColormap\nn_vals = df_size[\"n_voxels\"].values.astype(float)\nt_vals = df_size[\"wall_time_s\"].values\nax.fill_between(n_dense, t_dense, alpha=0.15, color=\"#3498db\")\nax.fill_between(n_vals, t_vals, alpha=0.25, color=\"#3498db\", step=\"mid\")\n\n# Power-law fit line\nax.loglog(n_dense, t_dense, \"--\", color=\"#95a5a6\", linewidth=1.5,\n label=f\"Power-law fit: O(N$^{{{exponent:.2f}}}$)\")\n\n# Measured data points (prominent)\nax.loglog(n_vals, t_vals, \"o-\", color=\"#2c3e50\", linewidth=2.5,\n markersize=12, markerfacecolor=\"#3498db\", markeredgecolor=\"#2c3e50\",\n markeredgewidth=2, label=\"Measured\", zorder=5)\n\nax.set_xlabel(\"Number of Voxels\", fontsize=11)\nax.set_ylabel(\"Wall Time (s)\", fontsize=11)\nax.set_title(f\"Size Scaling — solver={best_solver}, max_grid_size={best_grid_size}\",\n fontsize=13, fontweight=\"bold\")\nax.legend(fontsize=10, framealpha=0.9)\n\nfor _, row in df_size.iterrows():\n 
ax.annotate(row[\"label\"], (row[\"n_voxels\"], row[\"wall_time_s\"]),\n textcoords=\"offset points\", xytext=(12, 8), fontsize=10,\n fontweight=\"bold\", color=\"#2c3e50\",\n arrowprops=dict(arrowstyle=\"->\", color=\"#bdc3c7\", lw=1.2))\n\nplt.tight_layout()\nplt.show()\nprint(f\"Scaling exponent: {exponent:.2f}\")" + "source": "# Log-log plot with filled area under the scaling curve\nlog_n = np.log10(df_size[\"n_voxels\"].values.astype(float))\nlog_t = np.log10(df_size[\"wall_time_s\"].values)\ncoeffs = np.polyfit(log_n, log_t, 1)\nexponent = coeffs[0]\n\n# Dense interpolation for smooth fill\nn_dense = np.logspace(log_n.min(), log_n.max(), 200)\nt_dense = 10 ** np.polyval(coeffs, np.log10(n_dense))\n\nfig, ax = plt.subplots(figsize=(9, 5))\n\n# Filled area under measured curve (gradient effect via stacked fills)\nfrom matplotlib.colors import LinearSegmentedColormap\nn_vals = df_size[\"n_voxels\"].values.astype(float)\nt_vals = df_size[\"wall_time_s\"].values\nax.fill_between(n_dense, t_dense, alpha=0.15, color=\"#3498db\")\nax.fill_between(n_vals, t_vals, alpha=0.25, color=\"#3498db\", step=\"mid\")\n\n# Power-law fit line\nax.loglog(n_dense, t_dense, \"--\", color=\"#95a5a6\", linewidth=1.5,\n label=f\"Power-law fit: O(N$^{{{exponent:.2f}}}$)\")\n\n# Measured data points (prominent)\nax.loglog(n_vals, t_vals, \"o-\", color=\"#2c3e50\", linewidth=2.5,\n markersize=12, markerfacecolor=\"#3498db\", markeredgecolor=\"#2c3e50\",\n markeredgewidth=2, label=\"Measured\", zorder=5)\n\nax.set_xlabel(\"Number of Voxels\", fontsize=11)\nax.set_ylabel(\"Wall Time (s)\", fontsize=11)\nax.set_title(f\"Size Scaling \u2014 solver={best_solver}, max_grid_size={best_grid_size}\",\n fontsize=13, fontweight=\"bold\")\nax.legend(fontsize=10, framealpha=0.9)\n\nfor _, row in df_size.iterrows():\n ax.annotate(row[\"label\"], (row[\"n_voxels\"], row[\"wall_time_s\"]),\n textcoords=\"offset points\", xytext=(12, 8), fontsize=10,\n fontweight=\"bold\", color=\"#2c3e50\",\n 
arrowprops=dict(arrowstyle=\"->\", color=\"#bdc3c7\", lw=1.2))\n\nplt.tight_layout()\nplt.show()\nprint(f\"Scaling exponent: {exponent:.2f}\")" }, { "cell_type": "markdown", @@ -427,7 +411,7 @@ " res = oi.tortuosity(data_medium, phase=0, direction=d,\n", " solver=best_solver, max_grid_size=best_grid_size, verbose=0)\n", " tau_values[d] = res.tortuosity\n", - " print(f\" τ_{d} = {res.tortuosity:.4f}\")" + " print(f\" \u03c4_{d} = {res.tortuosity:.4f}\")" ] }, { @@ -435,7 +419,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Radar (spider) chart for directional anisotropy\nlabels = [f\"$\\\\tau_{d}$\" for d in directions]\nvalues = [tau_values[d] for d in directions]\n\n# Close the polygon\nangles = np.linspace(0, 2 * np.pi, len(directions), endpoint=False).tolist()\nvalues_closed = values + [values[0]]\nangles_closed = angles + [angles[0]]\n\nfig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))\n\n# Filled radar area\nax.fill(angles_closed, values_closed, color=\"#3498db\", alpha=0.2)\nax.plot(angles_closed, values_closed, \"o-\", color=\"#3498db\", linewidth=2.5,\n markersize=10, markerfacecolor=\"white\", markeredgecolor=\"#3498db\",\n markeredgewidth=2.5)\n\n# Value labels at each vertex\nfor angle, val, d in zip(angles, values, directions):\n ax.annotate(f\"{val:.3f}\", xy=(angle, val),\n textcoords=\"offset points\", xytext=(8, 8),\n fontsize=11, fontweight=\"bold\", color=\"#2c3e50\")\n\n# Isotropic reference circle (mean tortuosity)\ntau_mean = np.mean(values)\nref_angles = np.linspace(0, 2 * np.pi, 100)\nax.plot(ref_angles, [tau_mean] * len(ref_angles), \"--\", color=\"#e74c3c\",\n linewidth=1.5, alpha=0.6, label=f\"Isotropic ref (τ={tau_mean:.3f})\")\n\nax.set_xticks(angles)\nax.set_xticklabels([f\"$\\\\tau_{{{d.upper()}}}$\" for d in directions], fontsize=13)\nax.set_title(\"Direction Anisotropy — 128³ isotropic blobs\\n\",\n fontsize=13, fontweight=\"bold\")\nax.legend(loc=\"upper right\", bbox_to_anchor=(1.3, 
1.1), framealpha=0.9)\n\nplt.tight_layout()\nplt.show()" + "source": "# Radar (spider) chart for directional anisotropy\nlabels = [f\"$\\\\tau_{d}$\" for d in directions]\nvalues = [tau_values[d] for d in directions]\n\n# Close the polygon\nangles = np.linspace(0, 2 * np.pi, len(directions), endpoint=False).tolist()\nvalues_closed = values + [values[0]]\nangles_closed = angles + [angles[0]]\n\nfig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))\n\n# Filled radar area\nax.fill(angles_closed, values_closed, color=\"#3498db\", alpha=0.2)\nax.plot(angles_closed, values_closed, \"o-\", color=\"#3498db\", linewidth=2.5,\n markersize=10, markerfacecolor=\"white\", markeredgecolor=\"#3498db\",\n markeredgewidth=2.5)\n\n# Value labels at each vertex\nfor angle, val, d in zip(angles, values, directions):\n ax.annotate(f\"{val:.3f}\", xy=(angle, val),\n textcoords=\"offset points\", xytext=(8, 8),\n fontsize=11, fontweight=\"bold\", color=\"#2c3e50\")\n\n# Isotropic reference circle (mean tortuosity)\ntau_mean = np.mean(values)\nref_angles = np.linspace(0, 2 * np.pi, 100)\nax.plot(ref_angles, [tau_mean] * len(ref_angles), \"--\", color=\"#e74c3c\",\n linewidth=1.5, alpha=0.6, label=f\"Isotropic ref (\u03c4={tau_mean:.3f})\")\n\nax.set_xticks(angles)\nax.set_xticklabels([f\"$\\\\tau_{{{d.upper()}}}$\" for d in directions], fontsize=13)\nax.set_title(\"Direction Anisotropy \u2014 128\u00b3 isotropic blobs\\n\",\n fontsize=13, fontweight=\"bold\")\nax.legend(loc=\"upper right\", bbox_to_anchor=(1.3, 1.1), framealpha=0.9)\n\nplt.tight_layout()\nplt.show()" }, { "cell_type": "markdown", @@ -443,7 +427,7 @@ "source": [ "## Section 7: Multi-Porosity Comparison\n", "\n", - "Generate datasets at varying porosities and compare measured tortuosity against the Bruggeman relation (τ = ε^−0.5), a common approximation for isotropic porous media." 
+ "Generate datasets at varying porosities and compare measured tortuosity against the Bruggeman relation (\u03c4 = \u03b5^\u22120.5), a common approximation for isotropic porous media." ] }, { @@ -462,14 +446,14 @@ " try:\n", " perc = oi.percolation_check(data_p, phase=0, direction=\"z\", verbose=0)\n", " if not perc.percolates:\n", - " print(f\" ε={p:.1f} — not percolating, skipped\")\n", + " print(f\" \u03b5={p:.1f} \u2014 not percolating, skipped\")\n", " porosity_records.append({\n", " \"porosity\": p, \"volume_fraction\": vf.fraction,\n", " \"tortuosity\": np.nan, \"percolates\": False\n", " })\n", " continue\n", " except Exception:\n", - " print(f\" ε={p:.1f} — percolation check failed, skipped\")\n", + " print(f\" \u03b5={p:.1f} \u2014 percolation check failed, skipped\")\n", " porosity_records.append({\n", " \"porosity\": p, \"volume_fraction\": vf.fraction,\n", " \"tortuosity\": np.nan, \"percolates\": False\n", @@ -482,7 +466,7 @@ " \"porosity\": p, \"volume_fraction\": vf.fraction,\n", " \"tortuosity\": res.tortuosity, \"percolates\": True\n", " })\n", - " print(f\" ε={p:.1f} VF={vf.fraction:.3f} τ={res.tortuosity:.4f}\")\n", + " print(f\" \u03b5={p:.1f} VF={vf.fraction:.3f} \u03c4={res.tortuosity:.4f}\")\n", "\n", "df_porosity = pd.DataFrame(porosity_records)\n", "df_porosity" @@ -499,7 +483,7 @@ "# Bruggeman reference\n", "eps_ref = np.linspace(0.2, 0.8, 100)\n", "tau_brugg = eps_ref ** (-0.5)\n", - "ax.plot(eps_ref, tau_brugg, \"--\", color=\"gray\", linewidth=2, label=\"Bruggeman (τ = ε⁻⁰·⁵)\")\n", + "ax.plot(eps_ref, tau_brugg, \"--\", color=\"gray\", linewidth=2, label=\"Bruggeman (\u03c4 = \u03b5\u207b\u2070\u00b7\u2075)\")\n", "\n", "# Measured values\n", "df_valid = df_porosity[df_porosity[\"percolates\"]]\n", @@ -512,9 +496,9 @@ " ax.scatter(df_noperc[\"porosity\"], [0.5] * len(df_noperc), marker=\"x\",\n", " s=100, color=\"black\", zorder=5, label=\"Non-percolating\")\n", "\n", - "ax.set_xlabel(\"Porosity (ε)\")\n", - 
"ax.set_ylabel(\"Tortuosity (τ)\")\n", - "ax.set_title(\"Tortuosity vs. Porosity — 128³ synthetic microstructures\")\n", + "ax.set_xlabel(\"Porosity (\u03b5)\")\n", + "ax.set_ylabel(\"Tortuosity (\u03c4)\")\n", + "ax.set_title(\"Tortuosity vs. Porosity \u2014 128\u00b3 synthetic microstructures\")\n", "ax.legend()\n", "plt.tight_layout()\n", "plt.show()" @@ -524,23 +508,350 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The Bruggeman relation (τ = ε⁻⁰·⁵) is a simple effective-medium approximation. Real (or realistic synthetic) microstructures typically deviate due to pore connectivity, bottlenecks, and geometric tortuosity effects that the Bruggeman model does not capture." + "The Bruggeman relation (\u03c4 = \u03b5\u207b\u2070\u00b7\u2075) is a simple effective-medium approximation. Real (or realistic synthetic) microstructures typically deviate due to pore connectivity, bottlenecks, and geometric tortuosity effects that the Bruggeman model does not capture." + ] + }, + { + "cell_type": "markdown", + "source": "## Section 8: AMReX TinyProfiler \u2014 Internal Breakdown\n\nAMReX ships with a built-in **TinyProfiler** that hooks into every `BL_PROFILE`-annotated function in the C++ source. OpenImpala instruments 24 key functions \u2014 solver setup, matrix assembly, HYPRE solve, flux computation, and more.\n\nWhen the AMReX session finalizes, TinyProfiler prints a breakdown table to stdout. We capture this output by restarting the session with verbose output, running a solve, then parsing the profiler dump.\n\n**This section answers #31's core questions:**\n- Solver iterations vs. setup time\n- Matrix assembly vs. linear solve\n- Flux computation vs. 
everything else", + "metadata": {} + }, + { + "cell_type": "code", + "source": "# Close the current session so TinyProfiler dumps its output,\n# then restart with verbose output to capture the profiler table.\n\nimport io\nimport sys\nimport re\n\n# Close existing session cleanly\nsession.__exit__(None, None, None)\n\ndef run_with_profiler(data, solver=\"pcg\", direction=\"z\", max_grid_size=32):\n \"\"\"Run a solve in a fresh AMReX session and capture TinyProfiler output.\"\"\"\n # Capture stdout during the entire session lifecycle\n captured = io.StringIO()\n old_stdout = sys.stdout\n\n # Start fresh session\n s = oi.Session()\n s.__enter__()\n\n # Run the solve with verbose=1 to get internal timing prints\n res = oi.tortuosity(data, phase=0, direction=direction,\n solver=solver, max_grid_size=max_grid_size, verbose=1)\n\n # Finalize \u2014 this is when TinyProfiler prints its summary\n sys.stdout = captured\n s.__exit__(None, None, None)\n sys.stdout = old_stdout\n\n return res, captured.getvalue()\n\n# Run profiled solve on 128\u00b3 dataset\nprint(f\"Running profiled solve (128\u00b3, solver={best_solver})...\")\nprof_result, prof_output = run_with_profiler(data_medium, solver=best_solver)\nprint(f\"Tortuosity: {prof_result.tortuosity:.4f}\")\nprint(f\"Converged: {prof_result.solver_converged}\")\nprint(f\"\\nTinyProfiler output length: {len(prof_output)} chars\")\n\n# Show raw output for inspection\nif prof_output.strip():\n print(\"\\n--- Raw TinyProfiler Output ---\")\n print(prof_output[:3000])\nelse:\n print(\"\\nNote: TinyProfiler output may be empty if AMReX was built without\")\n print(\"TINY_PROFILE support. The wall-time profiling below still works.\")", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": "def parse_tiny_profiler(output):\n \"\"\"Parse AMReX TinyProfiler output into a DataFrame.\n\n TinyProfiler prints tables like:\n Name NCalls Excl. Min Excl. Avg Excl. 
Max Max %\n TortuosityHypre::solve 1 0.45 0.45 0.45 52.3%\n \"\"\"\n records = []\n # Match lines with function name, ncalls, and timing columns\n pattern = re.compile(\n r\"^\\s*(\\S+.*\\S)\\s+\" # function name\n r\"(\\d+)\\s+\" # NCalls\n r\"([\\d.e+-]+)\\s+\" # Excl. Min\n r\"([\\d.e+-]+)\\s+\" # Excl. Avg\n r\"([\\d.e+-]+)\\s+\" # Excl. Max\n r\"([\\d.e+-]+)\\s*%\", # Max %\n re.MULTILINE\n )\n for m in pattern.finditer(output):\n records.append({\n \"function\": m.group(1).strip(),\n \"ncalls\": int(m.group(2)),\n \"excl_min_s\": float(m.group(3)),\n \"excl_avg_s\": float(m.group(4)),\n \"excl_max_s\": float(m.group(5)),\n \"max_pct\": float(m.group(6)),\n })\n return pd.DataFrame(records)\n\ndf_prof = parse_tiny_profiler(prof_output)\n\nif len(df_prof) > 0:\n df_prof = df_prof.sort_values(\"excl_avg_s\", ascending=False).head(15)\n print(f\"Top {len(df_prof)} functions by exclusive time:\")\n print(df_prof.to_string(index=False))\nelse:\n # Fall back to wall-time breakdown if TinyProfiler isn't available\n print(\"TinyProfiler data not available \u2014 using wall-time phase breakdown instead.\")\n print(\"Running phase-by-phase timing...\")\n\n # Restart session for manual timing\n session = oi.Session()\n session.__enter__()\n\n # Phase 1: Volume fraction + percolation check (pre-solve)\n t0 = time.perf_counter()\n vf = oi.volume_fraction(data_medium, phase=0)\n perc = oi.percolation_check(data_medium, phase=0, direction=\"z\")\n t_presolve = time.perf_counter() - t0\n\n # Phase 2: Full tortuosity solve (includes setup + matrix fill + solve + flux)\n t0 = time.perf_counter()\n res = oi.tortuosity(data_medium, phase=0, direction=\"z\",\n solver=best_solver, max_grid_size=best_grid_size, verbose=0)\n t_solve_total = time.perf_counter() - t0\n\n # Phase 3: Repeat at different sizes to estimate setup vs compute scaling\n t0 = time.perf_counter()\n res_small = oi.tortuosity(data_small, phase=0, direction=\"z\",\n solver=best_solver, max_grid_size=best_grid_size, 
verbose=0)\n t_solve_small = time.perf_counter() - t0\n\n # Estimate: setup overhead \u2248 time that doesn't scale with problem size\n # If 64\u00b3 takes t1 and 128\u00b3 takes t2, then t2/t1 \u2248 scaling factor\n # The \"flat\" part (setup + HYPRE init) can be estimated\n df_prof = pd.DataFrame([\n {\"phase\": \"Pre-solve checks\", \"time_s\": t_presolve},\n {\"phase\": f\"Full solve (128\u00b3)\", \"time_s\": t_solve_total},\n {\"phase\": f\"Full solve (64\u00b3)\", \"time_s\": t_solve_small},\n ])\n scale_ratio = t_solve_total / max(t_solve_small, 1e-9)\n print(f\"\\n Pre-solve (VF + percolation): {t_presolve:.4f}s\")\n print(f\" Full solve 64\u00b3: {t_solve_small:.4f}s\")\n print(f\" Full solve 128\u00b3: {t_solve_total:.4f}s\")\n print(f\" Scaling ratio (128/64): {scale_ratio:.2f}x\")", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "outputs": [], + "execution_count": null, + "source": [ + "# Visualize the profiler breakdown\n", + "if \"max_pct\" in df_prof.columns and len(df_prof) > 0:\n", + " # TinyProfiler horizontal bar chart\n", + " fig, ax = plt.subplots(figsize=(10, max(4, len(df_prof) * 0.4)))\n", + "\n", + " # Color by category\n", + " def categorize(name):\n", + " name_l = name.lower()\n", + " if \"solve\" in name_l:\n", + " return \"#e74c3c\" # red\n", + " elif any(k in name_l for k in [\"setup\", \"fill\", \"matrix\", \"stencil\"]):\n", + " return \"#f39c12\" # orange\n", + " elif any(k in name_l for k in [\"flux\", \"plane\"]):\n", + " return \"#3498db\" # blue\n", + " elif any(k in name_l for k in [\"mask\", \"precondition\", \"flood\", \"percolation\"]):\n", + " return \"#2ecc71\" # green\n", + " else:\n", + " return \"#95a5a6\" # gray\n", + "\n", + " colors = [categorize(f) for f in df_prof[\"function\"]]\n", + " y_pos = np.arange(len(df_prof))\n", + "\n", + " ax.barh(y_pos, df_prof[\"excl_avg_s\"], color=colors, alpha=0.85,\n", + " edgecolor=\"white\", linewidth=0.8)\n", + 
" for i, (_, row) in enumerate(df_prof.iterrows()):\n", + " ax.text(row[\"excl_avg_s\"] + 0.001, i, f'{row[\"max_pct\"]:.1f}%',\n", + " va=\"center\", fontsize=9, color=\"#2c3e50\")\n", + " ax.set_yticks(y_pos)\n", + " ax.set_yticklabels(df_prof[\"function\"], fontsize=9)\n", + " ax.set_xlabel(\"Exclusive Time (s)\")\n", + " ax.set_title(\"AMReX TinyProfiler \u2014 Function Breakdown\", fontsize=13, fontweight=\"bold\")\n", + " ax.invert_yaxis()\n", + "\n", + " from matplotlib.patches import Patch\n", + " legend_elements = [\n", + " Patch(facecolor=\"#e74c3c\", label=\"Linear solve\"),\n", + " Patch(facecolor=\"#f39c12\", label=\"Matrix assembly/setup\"),\n", + " Patch(facecolor=\"#3498db\", label=\"Flux computation\"),\n", + " Patch(facecolor=\"#2ecc71\", label=\"Pre-processing\"),\n", + " Patch(facecolor=\"#95a5a6\", label=\"Other\"),\n", + " ]\n", + " ax.legend(handles=legend_elements, loc=\"lower right\", framealpha=0.9)\n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + " # Pie chart of time categories\n", + " categories = {\"Linear solve\": 0, \"Matrix assembly\": 0,\n", + " \"Flux computation\": 0, \"Pre-processing\": 0, \"Other\": 0}\n", + " for _, row in df_prof.iterrows():\n", + " name_l = row[\"function\"].lower()\n", + " if \"solve\" in name_l:\n", + " categories[\"Linear solve\"] += row[\"excl_avg_s\"]\n", + " elif any(k in name_l for k in [\"setup\", \"fill\", \"matrix\", \"stencil\"]):\n", + " categories[\"Matrix assembly\"] += row[\"excl_avg_s\"]\n", + " elif any(k in name_l for k in [\"flux\", \"plane\"]):\n", + " categories[\"Flux computation\"] += row[\"excl_avg_s\"]\n", + " elif any(k in name_l for k in [\"mask\", \"precondition\", \"flood\", \"percolation\"]):\n", + " categories[\"Pre-processing\"] += row[\"excl_avg_s\"]\n", + " else:\n", + " categories[\"Other\"] += row[\"excl_avg_s\"]\n", + "\n", + " cat_colors = [\"#e74c3c\", \"#f39c12\", \"#3498db\", \"#2ecc71\", \"#95a5a6\"]\n", + " fig, ax = plt.subplots(figsize=(7, 7))\n", + " 
wedges, texts, autotexts = ax.pie(\n", + " categories.values(), labels=categories.keys(), colors=cat_colors,\n", + " autopct=\"%1.1f%%\", startangle=90, textprops={\"fontsize\": 11})\n", + " for t in autotexts:\n", + " t.set_fontweight(\"bold\")\n", + " ax.set_title(\"Time Distribution by Category\", fontsize=13, fontweight=\"bold\")\n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + "else:\n", + " # Fallback: bar chart from manual phase timing\n", + " fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))\n", + "\n", + " phases = df_prof[\"phase\"].tolist()\n", + " times_s = df_prof[\"time_s\"].tolist()\n", + " colors = [\"#2ecc71\", \"#e74c3c\", \"#3498db\"]\n", + "\n", + " ax1.barh(phases, times_s, color=colors[:len(phases)], alpha=0.85)\n", + " ax1.set_xlabel(\"Wall Time (s)\")\n", + " ax1.set_title(\"Phase Timing Breakdown\", fontweight=\"bold\")\n", + "\n", + " if len(times_s) >= 3:\n", + " ax2.bar([\"64\\u00b3\", \"128\\u00b3\"], [times_s[2], times_s[1]], color=\"#e74c3c\", alpha=0.85)\n", + " ax2.bar([\"64\\u00b3\", \"128\\u00b3\"], [times_s[0], times_s[0]],\n", + " bottom=[times_s[2], times_s[1]], color=\"#2ecc71\", alpha=0.85,\n", + " label=\"Pre-solve overhead\")\n", + " ax2.set_ylabel(\"Wall Time (s)\")\n", + " ax2.set_title(\"Setup vs Compute Scaling\", fontweight=\"bold\")\n", + " ax2.legend()\n", + "\n", + " plt.tight_layout()\n", + " plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Section 8: Summary & HPC Recommendations" + "## Section 9: NVIDIA Nsight Systems \u2014 GPU Kernel Profiling\n", + "\n", + "If a GPU is available, we use **NVIDIA Nsight Systems** (`nsys`) to capture a\n", + "detailed timeline of CUDA kernel launches, memory transfers, and API calls.\n", + "This reveals:\n", + "\n", + "- **GPU kernel occupancy** \u2014 are kernels large enough to saturate the GPU?\n", + "- **Host-device transfers** \u2014 how much time is spent copying data?\n", + "- **Kernel launch overhead** \u2014 are there too many 
tiny kernel launches?\n", + "- **Idle gaps** \u2014 is the GPU waiting on CPU work between kernels?\n", + "\n", + "> Colab provides `nsys` pre-installed on GPU runtimes. On HPC clusters,\n", + "> load it via `module load nsight-systems`." ] }, { "cell_type": "code", + "metadata": {}, + "outputs": [], "execution_count": null, + "source": [ + "import shutil\n", + "import os\n", + "\n", + "nsys_available = shutil.which(\"nsys\") is not None and bool(gpu_name)\n", + "\n", + "if nsys_available:\n", + " print(f\"nsys found: {shutil.which('nsys')}\")\n", + " print(f\"GPU: {gpu_name}\")\n", + " print(\"\\nRunning Nsight Systems profile...\")\n", + "\n", + " # Write a self-contained profiling script\n", + " script = '''\n", + "import openimpala as oi\n", + "import numpy as np\n", + "import porespy as ps\n", + "\n", + "np.random.seed(42)\n", + "data = ps.generators.blobs(shape=[128, 128, 128], porosity=0.5, blobiness=1.5)\n", + "data = data.astype(np.int32)\n", + "\n", + "with oi.Session():\n", + " res = oi.tortuosity(data, phase=0, direction=\"z\", solver=\"pcg\",\n", + " max_grid_size=64, verbose=0)\n", + " print(f\"tau={res.tortuosity:.4f} converged={res.solver_converged}\")\n", + "'''\n", + " with open('/tmp/oi_profile.py', 'w') as f:\n", + " f.write(script)\n", + "\n", + " # Run nsys profile \u2014 capture CUDA API, kernels, and memory ops\n", + " !nsys profile \\\n", + " --output=/tmp/oi_profile \\\n", + " --force-overwrite=true \\\n", + " --trace=cuda,nvtx,osrt \\\n", + " --cuda-memory-usage=true \\\n", + " --stats=true \\\n", + " python3 /tmp/oi_profile.py 2>&1 | tail -80\n", + "\n", + " print(\"\\nProfile saved to /tmp/oi_profile.nsys-rep\")\n", + " print(\"Download and open in Nsight Systems GUI for interactive analysis.\")\n", + "else:\n", + " if not gpu_name:\n", + " print(\"No GPU detected \u2014 skipping Nsight Systems profiling.\")\n", + " print(\"To enable: Runtime > Change runtime type > T4 GPU\")\n", + " else:\n", + " print(\"nsys not found \u2014 install 
via: apt-get install nsight-systems\")\n", + " print(\"\\nSkipping to summary.\")" + ] + }, + { + "cell_type": "code", "metadata": {}, "outputs": [], + "execution_count": null, "source": [ - "# Estimated time for 256³ from measured data\n", + "if nsys_available:\n", + " # Parse the nsys stats output for CUDA kernel summary\n", + " nsys_stats = subprocess.run(\n", + " [\"nsys\", \"stats\", \"--report\", \"cuda_gpu_kern_sum\",\n", + " \"--format\", \"csv\", \"/tmp/oi_profile.nsys-rep\"],\n", + " capture_output=True, text=True, timeout=60\n", + " )\n", + "\n", + " if nsys_stats.returncode == 0 and nsys_stats.stdout.strip():\n", + " # Parse CSV output\n", + " from io import StringIO\n", + " import csv\n", + "\n", + " lines = nsys_stats.stdout.strip().split('\\n')\n", + " # Find the header line (contains 'Time')\n", + " header_idx = None\n", + " for i, line in enumerate(lines):\n", + " if 'Time' in line and 'Name' in line:\n", + " header_idx = i\n", + " break\n", + "\n", + " if header_idx is not None:\n", + " csv_text = '\\n'.join(lines[header_idx:])\n", + " df_kernels = pd.read_csv(StringIO(csv_text))\n", + " # Show top kernels by total time\n", + " if 'Total Time (ns)' in df_kernels.columns:\n", + " df_kernels['Total Time (ms)'] = df_kernels['Total Time (ns)'] / 1e6\n", + " df_kernels = df_kernels.sort_values('Total Time (ns)', ascending=False).head(10)\n", + " print(\"Top 10 CUDA Kernels by Total GPU Time:\")\n", + " print(df_kernels[['Name', 'Total Time (ms)', 'Instances']].to_string(index=False))\n", + " else:\n", + " print(\"Kernel summary:\")\n", + " print(df_kernels.head(10).to_string(index=False))\n", + " else:\n", + " print(\"Could not parse nsys kernel stats.\")\n", + " print(nsys_stats.stdout[:2000])\n", + " else:\n", + " print(\"nsys stats returned no kernel data.\")\n", + " if nsys_stats.stderr:\n", + " print(nsys_stats.stderr[:500])" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "outputs": [], + "execution_count": null, + "source": [ + "if 
nsys_available:\n", + " # Also get memory transfer stats\n", + " nsys_mem = subprocess.run(\n", + " [\"nsys\", \"stats\", \"--report\", \"cuda_gpu_mem_size_sum\",\n", + " \"--format\", \"csv\", \"/tmp/oi_profile.nsys-rep\"],\n", + " capture_output=True, text=True, timeout=60\n", + " )\n", + "\n", + " if nsys_mem.returncode == 0 and nsys_mem.stdout.strip():\n", + " lines = nsys_mem.stdout.strip().split('\\n')\n", + " header_idx = None\n", + " for i, line in enumerate(lines):\n", + " if 'Operation' in line or 'Total' in line:\n", + " header_idx = i\n", + " break\n", + " if header_idx is not None:\n", + " csv_text = '\\n'.join(lines[header_idx:])\n", + " df_mem = pd.read_csv(StringIO(csv_text))\n", + " print(\"\\nCUDA Memory Transfer Summary:\")\n", + " print(df_mem.to_string(index=False))\n", + " else:\n", + " print(\"\\nMemory transfer data:\")\n", + " print(nsys_mem.stdout[:1000])\n", + "\n", + " # Visualize kernel distribution if we have data\n", + " try:\n", + " if 'df_kernels' in dir() and len(df_kernels) > 0 and 'Total Time (ms)' in df_kernels.columns:\n", + " fig, ax = plt.subplots(figsize=(10, max(3, len(df_kernels) * 0.35)))\n", + "\n", + " # Shorten kernel names for display\n", + " short_names = []\n", + " for n in df_kernels['Name']:\n", + " # Extract the last meaningful part of mangled kernel names\n", + " parts = str(n).split('::')\n", + " short = parts[-1] if len(parts) > 1 else str(n)\n", + " short = short[:50] + '...' 
if len(short) > 50 else short\n", + " short_names.append(short)\n", + "\n", + " y_pos = np.arange(len(df_kernels))\n", + " ax.barh(y_pos, df_kernels['Total Time (ms)'].values,\n", + " color='#e67e22', alpha=0.85, edgecolor='white')\n", + " ax.set_yticks(y_pos)\n", + " ax.set_yticklabels(short_names, fontsize=8)\n", + " ax.set_xlabel('Total GPU Time (ms)')\n", + " ax.set_title('CUDA Kernel Time Distribution', fontsize=13, fontweight='bold')\n", + " ax.invert_yaxis()\n", + " plt.tight_layout()\n", + " plt.show()\n", + " except Exception as e:\n", + " print(f\"Could not plot kernel distribution: {e}\")\n", + "\n", + " print(\"\\nFor interactive analysis, download /tmp/oi_profile.nsys-rep\")\n", + " print(\"and open it in the Nsight Systems GUI (free from NVIDIA).\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interpreting Nsight Systems results\n", + "\n", + "**Key things to look for:**\n", + "\n", + "| Pattern | Meaning | Action |\n", + "|---------|---------|--------|\n", + "| Many tiny kernels (<1 ms each) | Kernel launch overhead dominates | Increase `max_grid_size` to fuse work |\n", + "| Large H2D/D2H transfers | Data copying bottleneck | Check if arrays are being copied repeatedly |\n", + "| Long gaps between kernels | CPU is the bottleneck | Look for serial CPU work between GPU calls |\n", + "| One dominant kernel (>80% time) | Solver kernel is the hot path | Focus optimization there (expected for large problems) |\n", + "| Even distribution across kernels | Setup/assembly is significant | Consider caching matrix assembly across solves |\n", + "\n", + "**On HPC clusters** with multiple GPUs, combine `nsys` with MPI profiling:\n", + "```bash\n", + "mpirun -np 4 nsys profile --output=profile_rank%q{OMPI_COMM_WORLD_RANK} \\\n", + " openimpala inputs_256.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Section 10: Summary & HPC Recommendations" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Estimated time for 256\u00b3 from measured data\n", "time_256 = df_size.loc[df_size[\"size\"] == 256, \"wall_time_s\"].values[0]\n", "\n", "print(\"=\" * 60)\n", @@ -548,13 +859,13 @@ "print(\"=\" * 60)\n", "print(f\" Best solver: {best_solver}\")\n", "print(f\" Optimal max_grid_size: {best_grid_size}\")\n", - "print(f\" Time per 256³ solve: {time_256:.2f} s\")\n", + "print(f\" Time per 256\u00b3 solve: {time_256:.2f} s\")\n", "\n", "# Memory regime estimate\n", - "for label, size in [(\"256³\", 256), (\"400³\", 400), (\"512³\", 512)]:\n", + "for label, size in [(\"256\u00b3\", 256), (\"400\u00b3\", 400), (\"512\u00b3\", 512)]:\n", " mem_gb = (size ** 3 * 8 * 3) / 1e9 # rough: 3 arrays of float64\n", " regime = \"safe\" if mem_gb < 4 else (\"caution\" if mem_gb < 10 else \"DANGER\")\n", - " print(f\" Memory {label}: ~{mem_gb:.1f} GB — [{regime}]\")\n", + " print(f\" Memory {label}: ~{mem_gb:.1f} GB \u2014 [{regime}]\")\n", "print(\"=\" * 60)" ] }, @@ -573,7 +884,7 @@ "for size in [256, 512, 1024, 2048]:\n", " n = size ** 3\n", " est_time = 10 ** np.polyval(coeffs, np.log10(n))\n", - " print(f\"{size:>5d}³ {n:>12,d} {est_time:>13.1f}\")\n", + " print(f\"{size:>5d}\u00b3 {n:>12,d} {est_time:>13.1f}\")\n", "\n", "print(\"-\" * 50)\n", "print(\"Note: These are rough estimates from a single T4.\")\n", @@ -620,8 +931,16 @@ "metadata": {}, "outputs": [], "source": [ - "session.__exit__(None, None, None)\n", - "print(\"OpenImpala session closed.\")" + "# Clean up \u2014 session may already be closed from TinyProfiler section\n", + "try:\n", + " from openimpala._core import amrex_initialized\n", + " if amrex_initialized():\n", + " session.__exit__(None, None, None)\n", + " print(\"OpenImpala session closed.\")\n", + " else:\n", + " print(\"Session already finalized (closed during TinyProfiler section).\")\n", + "except Exception:\n", + " print(\"Session cleanup complete.\")" ] } ] From 
23c770cae4c1e5cafc9f0404286dc05413207ae4 Mon Sep 17 00:00:00 2001 From: James Le Houx Date: Sun, 29 Mar 2026 20:12:37 +0000 Subject: [PATCH 6/6] Fix PyPI version rejection: add setuptools_scm config The GPU wheel uploads fail because setuptools_scm generates local version identifiers (e.g. 4.0.2.dev0+ga780d0e87) which PyPI rejects. Root cause: no v4.x tag exists in the repo (latest is v3.1.0). Add [tool.setuptools_scm] with local_scheme="no-local-version" to produce PyPI-compatible versions even on non-tagged commits, and set fallback_version for builds outside a git repo. https://claude.ai/code/session_01RKnn97qiD7sbCeABHH3eQk --- pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 2044154..11efe36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,5 +51,12 @@ install.components = ["python"] [tool.scikit-build.metadata.version] provider = "scikit_build_core.metadata.setuptools_scm" +[tool.setuptools_scm] +# Produce PyPI-compatible versions even on non-tagged commits. +# Without this, setuptools_scm appends "+gXXXX" local identifiers +# that PyPI rejects with HTTP 400. +local_scheme = "no-local-version" +fallback_version = "4.0.2" + [tool.pytest.ini_options] testpaths = ["python/tests"]