diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
new file mode 100644
index 00000000..acd1f3cd
--- /dev/null
+++ b/.github/release-drafter.yml
@@ -0,0 +1,133 @@
+# Configuration for Release Drafter
+# https://github.com/release-drafter/release-drafter
+#
+# Automatically drafts GitHub release notes from merged PRs.
+# PRs are categorized by their labels into changelog sections.
+
+name-template: 'v$RESOLVED_VERSION'
+tag-template: 'v$RESOLVED_VERSION'
+
+# Determine the next version bump ($RESOLVED_VERSION) from PR labels
+version-resolver:
+  major:
+    labels:
+      - 'breaking'
+  minor:
+    labels:
+      - 'enhancement'
+      - 'feature'
+      - 'physics'
+  patch:
+    labels:
+      - 'bug'
+      - 'fix'
+      - 'performance'
+      - 'documentation'
+      - 'devops'
+      - 'dependencies'
+  default: patch  # bump used when none of the labels listed above is present
+
+# Map PR labels to changelog sections
+categories:
+  - title: '🔬 Physics & Solvers'
+    labels:
+      - 'physics'
+      - 'solver'
+  - title: '🚀 New Features'
+    labels:
+      - 'feature'
+      - 'enhancement'
+  - title: '⚡ Performance'
+    labels:
+      - 'performance'
+      - 'gpu'
+  - title: '🐛 Bug Fixes'
+    labels:
+      - 'bug'
+      - 'fix'
+  - title: '📖 Documentation'
+    labels:
+      - 'documentation'
+  - title: '🔧 DevOps & CI'
+    labels:
+      - 'devops'
+      - 'ci'
+  - title: '📦 Dependencies'
+    labels:
+      - 'dependencies'
+  - title: '⚠️ Breaking Changes'
+    labels:
+      - 'breaking'
+  - title: '🧹 Maintenance'
+    labels:
+      - 'maintenance'
+      - 'refactor'
+
+# Exclude PRs with these labels from release notes
+exclude-labels:
+  - 'skip-changelog'
+
+# Template for the release body
+template: |
+  ## What's Changed
+
+  $CHANGES
+
+  **Full Changelog**: https://github.com/$OWNER/$REPOSITORY/compare/$PREVIOUS_TAG...v$RESOLVED_VERSION
+
+# Auto-label PRs based on changed file paths and PR title patterns
+autolabeler:
+  - label: 'physics'
+    files:
+      - 'src/props/**'
+    title:
+      - '/tortuosity/i'
+      - '/diffusiv/i'
+      - '/solver/i'
+      - '/HYPRE/i'
+      - '/MLMG/i'
+  - label: 'io'
+    files:
+      - 'src/io/**'
+    title:
+      - '/reader/i'
+      - '/TIFF/i'
+      - '/HDF5/i'
+  - label: 'documentation'
+    files:
+      - 'docs/**'
+      - '*.md'
+      - 'Doxyfile'
+    title:
+      - '/docs/i'
+      - '/documentation/i'
+  - label: 'devops'
+    files:
+      - '.github/**'
+      - 'containers/**'
+      - 'pyproject.toml'
+      - 'CMakeLists.txt'
+    title:
+      - '/\bCI\b/i'  # word-bounded: a bare /CI/i also matches "precision", "specific", ...
+      - '/workflow/i'
+      - '/wheel/i'
+      - '/PyPI/i'
+  - label: 'gpu'
+    files:
+      - 'src/props/*GPU*'
+      - 'src/props/*CUDA*'
+    title:
+      - '/CUDA/i'
+      - '/GPU/i'
+      - '/NVCC/i'
+  - label: 'python'
+    files:
+      - 'python/**'
+    title:
+      - '/python/i'
+      - '/pybind/i'
+      - '/binding/i'
+  - label: 'tests'
+    files:
+      - 'tests/**'
+      - 'python/tests/**'
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 9e59ba82..0f45bb0c 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -1,11 +1,12 @@
 # .github/workflows/docs.yml
-name: Deploy Doxygen Documentation
+name: Deploy Documentation
 
 on:
   push:
     branches: [master]
     paths:
       - 'src/**'
+      - 'python/**'
       - 'Doxyfile'
       - 'docs/**'
       - '.github/workflows/docs.yml'
@@ -24,7 +25,7 @@ concurrency:
 
 jobs:
   build-docs:
-    name: Build Doxygen Documentation
+    name: Build Documentation
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
@@ -35,15 +36,28 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y doxygen graphviz
           echo "Doxygen version: $(doxygen --version)"
-          echo "Dot version: $(dot -V 2>&1)"
 
-      - name: Generate documentation
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install Sphinx and dependencies
+        run: pip install -r docs/requirements.txt
+
+      - name: Generate Doxygen XML and HTML
         run: doxygen Doxyfile
 
+      - name: Build Sphinx documentation
+        run: sphinx-build -b html docs/ docs/_build/html
+
+      - name: Copy Doxygen HTML into Sphinx output  # published under doxygen/html/, the path docs/api/cpp.md links to
+        run: mkdir -p docs/_build/html/doxygen && cp -r docs/doxygen/html docs/_build/html/doxygen/html
+
       - name: Upload Pages artifact
         uses: actions/upload-pages-artifact@v3
         with:
-          path: docs/doxygen/html
+          path: docs/_build/html
 
   deploy:
     name: Deploy to GitHub Pages
diff --git a/.github/workflows/pypi-wheels-cpu.yml b/.github/workflows/pypi-wheels-cpu.yml
index e3f85ec6..9fd6762b 100644
--- a/.github/workflows/pypi-wheels-cpu.yml
+++ b/.github/workflows/pypi-wheels-cpu.yml
@@ -16,6 +16,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           submodules: recursive # Fetches Catch2, nlohmann/json, or pybind11 if needed
+          fetch-depth: 0
 
       - name: Set up Python
         uses: actions/setup-python@v5
diff --git a/.github/workflows/pypi-wheels-gpu.yml b/.github/workflows/pypi-wheels-gpu.yml
index 82b22274..f7023bf7 100644
--- a/.github/workflows/pypi-wheels-gpu.yml
+++ b/.github/workflows/pypi-wheels-gpu.yml
@@ -16,6 +16,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           submodules: recursive
+          fetch-depth: 0
 
       - name: Set up Python
         uses: actions/setup-python@v5
diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml
new file mode 100644
index 00000000..9d6097b9
--- /dev/null
+++ b/.github/workflows/release-drafter.yml
@@ -0,0 +1,21 @@
+# .github/workflows/release-drafter.yml
+name: Release Drafter
+
+on:
+  push:
+    branches:
+      - master
+  pull_request_target:
+    types: [opened, reopened, synchronize]
+
+permissions:
+  contents: write  # needed to create/update the draft GitHub release
+  pull-requests: write  # needed by the autolabeler to add labels to PRs
+
+jobs:
+  update-release-draft:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: release-drafter/release-drafter@v6
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 0c5db227..44e4f6c2 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -15,6 +15,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: Set up Apptainer
         uses: eWaterCycle/setup-apptainer@v2
@@ -121,10 +123,14 @@ jobs:
           sudo apptainer build "$SIF_FILENAME" Singularity.final.def
           echo "SIF_FILENAME=$SIF_FILENAME" >> $GITHUB_ENV
           
-      - name: Create GitHub Release and Upload SIF
+      - name: Upload SIF to GitHub Release
         uses: softprops/action-gh-release@v2
         with:
           files: ${{ env.SIF_FILENAME }}
+          # Release notes are pre-populated by Release Drafter.
+          # Only fall back to auto-generated notes if the body is empty
+          # (e.g. tag was pushed without a prior draft).
           generate_release_notes: true
+          append_body: true
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 16aca611..a0e29311 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,8 +15,37 @@
 
 cmake_minimum_required(VERSION 3.18)
 
+# ---------------------------------------------------------------------------
+# Derive project version from the latest Git tag (e.g. v4.0.1 → 4.0.1).
+# Falls back to OPENIMPALA_FALLBACK_VERSION when building outside a Git
+# repository or when no tag is reachable (e.g. shallow clone without --tags).
+# ---------------------------------------------------------------------------
+set(OPENIMPALA_FALLBACK_VERSION "4.0.1")
+
+find_package(Git QUIET)
+if(GIT_FOUND)
+    execute_process(
+        COMMAND "${GIT_EXECUTABLE}" describe --tags --match "v[0-9]*" --abbrev=0
+        WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+        OUTPUT_VARIABLE _git_tag
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+        RESULT_VARIABLE _git_result
+    )
+    if(_git_result EQUAL 0 AND _git_tag MATCHES "^v([0-9]+\\.[0-9]+\\.[0-9]+)")
+        set(_detected_version "${CMAKE_MATCH_1}")
+    endif()
+endif()
+
+if(NOT _detected_version)
+    set(_detected_version "${OPENIMPALA_FALLBACK_VERSION}")
+    message(STATUS "Git tag not found — using fallback version ${_detected_version}")
+else()
+    message(STATUS "Version from Git tag: ${_detected_version}")
+endif()
+
 project(OpenImpala
-    VERSION 0.1.0
+    VERSION ${_detected_version}
     LANGUAGES C CXX Fortran
     DESCRIPTION "Image-based simulation of transport properties in porous media"
 )
diff --git a/Doxyfile b/Doxyfile
index cced28a7..770d29ed 100644
--- a/Doxyfile
+++ b/Doxyfile
@@ -64,7 +64,8 @@ FULL_SIDEBAR           = NO
 GENERATE_LATEX         = NO
 GENERATE_RTF           = NO
 GENERATE_MAN           = NO
-GENERATE_XML           = NO
+GENERATE_XML           = YES
+XML_OUTPUT             = xml
 
 #---------------------------------------------------------------------------
 # Graphs (requires Graphviz dot)
diff --git a/docs/api/cpp.md b/docs/api/cpp.md
new file mode 100644
index 00000000..ef7be91d
--- /dev/null
+++ b/docs/api/cpp.md
@@ -0,0 +1,75 @@
+# C++ API Reference
+
+The C++ API reference is generated from Doxygen comments in the source code
+using [Breathe](https://breathe.readthedocs.io/).
+
+## Namespace
+
+All OpenImpala classes live in the `OpenImpala` namespace.
+
+## Key classes
+
+### I/O Readers
+
+```{eval-rst}
+.. doxygenclass:: OpenImpala::TiffReader
+   :members:
+   :outline:
+
+.. doxygenclass:: OpenImpala::HDF5Reader
+   :members:
+   :outline:
+
+.. doxygenclass:: OpenImpala::RawReader
+   :members:
+   :outline:
+```
+
+### Transport Solvers
+
+```{eval-rst}
+.. doxygenclass:: OpenImpala::TortuosityHypre
+   :members:
+   :outline:
+
+.. doxygenclass:: OpenImpala::TortuosityMLMG
+   :members:
+   :outline:
+
+.. doxygenclass:: OpenImpala::EffectiveDiffusivityHypre
+   :members:
+   :outline:
+```
+
+### Utilities
+
+```{eval-rst}
+.. doxygenclass:: OpenImpala::VolumeFraction
+   :members:
+   :outline:
+
+.. doxygenclass:: OpenImpala::PercolationCheck
+   :members:
+   :outline:
+
+.. doxygenclass:: OpenImpala::TortuositySolverBase
+   :members:
+   :outline:
+
+.. doxygenclass:: OpenImpala::HypreStructSolver
+   :members:
+   :outline:
+```
+
+### Configuration
+
+```{eval-rst}
+.. doxygenstruct:: OpenImpala::PhysicsConfig
+   :members:
+   :outline:
+```
+
+## Full Doxygen output
+
+For the complete class hierarchy, include dependency graphs, and file-level
+documentation, see the [Doxygen pages](../doxygen/html/index.html).
diff --git a/docs/api/python.rst b/docs/api/python.rst
new file mode 100644
index 00000000..415e12f1
--- /dev/null
+++ b/docs/api/python.rst
@@ -0,0 +1,50 @@
+Python API Reference
+====================
+
+High-level API
+--------------
+
+The recommended interface for most users. These functions accept NumPy arrays
+and return Python dataclasses.
+
+.. autofunction:: openimpala.facade.volume_fraction
+
+.. autofunction:: openimpala.facade.percolation_check
+
+.. autofunction:: openimpala.facade.tortuosity
+
+.. autofunction:: openimpala.facade.read_image
+
+
+Result types
+~~~~~~~~~~~~
+
+.. autoclass:: openimpala.facade.VolumeFractionResult
+   :members:
+
+.. autoclass:: openimpala.facade.PercolationResult
+   :members:
+
+.. autoclass:: openimpala.facade.TortuosityResult
+   :members:
+
+
+Session management
+------------------
+
+.. autoclass:: openimpala.Session
+   :members:
+   :special-members: __enter__, __exit__
+
+
+Exceptions
+----------
+
+.. autoclass:: openimpala.OpenImpalaError
+   :members:
+
+.. autoclass:: openimpala.ConvergenceError
+   :members:
+
+.. autoclass:: openimpala.PercolationError
+   :members:
diff --git a/docs/changelog.md b/docs/changelog.md
new file mode 100644
index 00000000..5727d94d
--- /dev/null
+++ b/docs/changelog.md
@@ -0,0 +1,43 @@
+# Changelog
+
+## v4.0.0 (2026-03-29)
+
+Major release introducing GPU acceleration, a new matrix-free solver,
+comprehensive architectural refactoring, and expanded tutorials.
+
+### Highlights
+
+- **CUDA GPU acceleration** via `openimpala-cuda` PyPI package
+- **TortuosityMLMG solver** — matrix-free AMReX geometric multigrid
+- **Microstructural parameterisation engine** — SSA, REV study, PSD, connected components
+- **Fortran-to-C++ kernel migration** — all compute kernels now native C++ AMReX lambdas
+- **7-part tutorial series** with Google Colab support
+
+See the full [release notes on GitHub](https://github.com/BASE-Laboratory/OpenImpala/releases/tag/v4.0.0).
+
+## v3.1.0 (2026-03-10)
+
+- Replaced `pyamrex` dependency with native C++ NumPy ingestion via `VoxelImage`
+- Self-contained PyPI wheels — `pip install openimpala` with zero compilation
+- Memory-safe workflows: ingest data, free Python array, then solve
+
+## v3.0.0 — v3.0.2
+
+- Python bindings via pybind11
+- CMake build system modernisation
+- scikit-build-core + cibuildwheel integration
+- Multi-phase transport support
+
+## v2.0.0 — v2.1.1
+
+- AMReX upgrade and CI/CD pipeline
+- Catch2 test framework integration
+- Code coverage with Codecov
+- clang-format and clang-tidy enforcement
+
+## v1.0.0 — v1.1.1
+
+- Initial public release
+- HYPRE-based tortuosity and effective diffusivity solvers
+- TIFF, HDF5, RAW, DAT image readers
+- Apptainer container builds
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 00000000..073bf280
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,87 @@
+# Configuration file for the Sphinx documentation builder.
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+import os
+import sys
+
+# -- Path setup ---------------------------------------------------------------
+# Add the Python package to sys.path so autodoc can find it.
+sys.path.insert(0, os.path.abspath(os.path.join("..", "python")))
+
+# -- Project information ------------------------------------------------------
+project = "OpenImpala"
+copyright = "2024-2026, BASE Laboratory, University of Greenwich"
+author = "James Le Houx"
+
+# Version comes from the installed "openimpala" package metadata; hard-coded fallback if the lookup fails.
+try:
+    from importlib.metadata import version as _version
+
+    release = _version("openimpala")
+except Exception:
+    release = "4.0.0"  # NOTE(review): CMakeLists.txt falls back to 4.0.1 — confirm which is current
+version = ".".join(release.split(".")[:2])
+
+# -- General configuration ----------------------------------------------------
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.mathjax",
+    "breathe",
+    "myst_parser",
+]
+
+templates_path = ["_templates"]
+exclude_patterns = ["_build", "doxygen", "Thumbs.db", ".DS_Store"]
+
+# -- MyST (Markdown) settings -------------------------------------------------
+myst_enable_extensions = [
+    "colon_fence",
+    "deflist",
+    "fieldlist",
+]
+source_suffix = {
+    ".rst": "restructuredtext",
+    ".md": "markdown",
+}
+
+# -- Breathe (Doxygen bridge) -------------------------------------------------
+breathe_projects = {"OpenImpala": os.path.abspath("doxygen/xml")}
+breathe_default_project = "OpenImpala"
+
+# -- Autodoc settings ---------------------------------------------------------
+autodoc_mock_imports = ["openimpala._core", "mpi4py"]
+autodoc_member_order = "bysource"
+autodoc_typehints = "description"
+
+# -- Napoleon (Google/NumPy docstrings) ----------------------------------------
+napoleon_google_docstring = False
+napoleon_numpy_docstring = True
+napoleon_use_rtype = False
+
+# -- Intersphinx (cross-project links) ----------------------------------------
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3", None),
+    "numpy": ("https://numpy.org/doc/stable/", None),
+}
+
+# -- HTML output ---------------------------------------------------------------
+html_theme = "furo"
+html_title = "OpenImpala"
+html_static_path = ["_static"]
+
+html_theme_options = {
+    "source_repository": "https://github.com/BASE-Laboratory/OpenImpala",
+    "source_branch": "master",
+    "source_directory": "docs/",
+    "light_css_variables": {
+        "color-brand-primary": "#2962FF",
+        "color-brand-content": "#2962FF",
+    },
+}
+
+# -- Autosummary ---------------------------------------------------------------
+autosummary_generate = True
diff --git a/docs/contributing.md b/docs/contributing.md
new file mode 100644
index 00000000..d1e6f2bd
--- /dev/null
+++ b/docs/contributing.md
@@ -0,0 +1,69 @@
+# Contributing
+
+Contributions to OpenImpala are welcome. This guide covers the development
+workflow and coding standards.
+
+## Development setup
+
+```bash
+git clone https://github.com/BASE-Laboratory/OpenImpala.git
+cd OpenImpala
+git checkout -b my-feature
+
+# Build (inside the dependency container)
+apptainer exec --bind "$(pwd):/src" dependency_image.sif bash -c "cd /src && make all -j"
+
+# Run tests
+apptainer exec --bind "$(pwd):/src" dependency_image.sif bash -c "cd /src && make test"
+```
+
+## Pull request workflow
+
+1. Fork the repository and create a feature branch
+2. Make your changes, ensuring tests pass
+3. Run `clang-format` on modified files
+4. Submit a pull request against `master`
+
+## Code style
+
+- **C++17**, 100-column line limit, 4-space indentation
+- LLVM-based style enforced by `.clang-format`
+- All code in `namespace OpenImpala`
+- Headers use `#ifndef` include guards (not `#pragma once`)
+- Doxygen `@file` / `@brief` / `@param` comments on all public APIs
+- Fortran files are **not** processed by clang-format
+
+### Formatting check
+
+```bash
+# Check formatting (CI runs this automatically)
+find src/ tests/ python/bindings/ -type f \( -name "*.cpp" -o -name "*.H" \) \
+  | xargs clang-format --dry-run --Werror
+
+# Auto-format
+find src/ tests/ python/bindings/ -type f \( -name "*.cpp" -o -name "*.H" \) \
+  | xargs clang-format -i
+```
+
+## Testing
+
+- **C++ tests:** CTest with Catch2 (run via `ctest --output-on-failure`)
+- **Python tests:** pytest (`python -m pytest python/tests/`)
+- **Analytical benchmarks:** Uniform block, series layers, parallel layers with
+  exact solutions
+
+## Building documentation
+
+```bash
+# Install doc dependencies
+pip install -r docs/requirements.txt
+
+# Generate Doxygen XML (needed by Breathe)
+doxygen Doxyfile
+
+# Build Sphinx HTML
+sphinx-build -b html docs/ docs/_build/html
+
+# View locally (macOS; use xdg-open on Linux)
+open docs/_build/html/index.html
+```
diff --git a/docs/getting-started.md b/docs/getting-started.md
new file mode 100644
index 00000000..a0ba9d08
--- /dev/null
+++ b/docs/getting-started.md
@@ -0,0 +1,114 @@
+# Getting Started
+
+## Installation
+
+### Python (recommended)
+
+OpenImpala is available on PyPI as pre-compiled wheels — no compilation required.
+
+```bash
+# CPU version (works everywhere)
+pip install openimpala
+
+# GPU version (requires NVIDIA CUDA runtime)
+pip install openimpala-cuda
+```
+
+**Requirements:** Python 3.8+ and NumPy. Optional: `mpi4py` for MPI parallelism.
+
+### Container (HPC)
+
+For HPC clusters, download the pre-built Apptainer/Singularity container from
+[GitHub Releases](https://github.com/BASE-Laboratory/OpenImpala/releases):
+
+```bash
+# Download the .sif file (change the version in the filename to match the latest release)
+wget https://github.com/BASE-Laboratory/OpenImpala/releases/latest/download/openimpala-v4.0.0.sif
+
+# Run interactively
+apptainer shell openimpala-v4.0.0.sif
+
+# Run a simulation
+apptainer exec openimpala-v4.0.0.sif /opt/OpenImpala/build/Diffusion3d inputs
+```
+
+### From source (developers)
+
+```bash
+git clone https://github.com/BASE-Laboratory/OpenImpala.git
+cd OpenImpala
+mkdir build && cd build
+cmake .. -DCMAKE_BUILD_TYPE=Release \
+         -DCMAKE_CXX_COMPILER=$(which mpicxx) \
+         -DCMAKE_Fortran_COMPILER=$(which mpif90)
+make -j$(nproc)
+ctest --output-on-failure
+```
+
+Dependencies: AMReX, HYPRE, HDF5, LibTIFF. See the
+[README](https://github.com/BASE-Laboratory/OpenImpala#native-installation-advanced)
+for full details.
+
+## Your first simulation
+
+```python
+import numpy as np
+import openimpala as oi
+
+# Create a simple porous medium (random 50/50 mix)
+data = np.random.choice([0, 1], size=(64, 64, 64), dtype=np.int32)
+
+with oi.Session():
+    # Volume fraction
+    vf = oi.volume_fraction(data, phase=1)
+    print(f"Volume fraction: {vf.fraction:.4f}")
+
+    # Percolation check
+    perc = oi.percolation_check(data, phase=1, direction="z")
+    print(f"Percolates: {perc.percolates}")
+
+    # Tortuosity (only if phase percolates)
+    if perc.percolates:
+        result = oi.tortuosity(data, phase=1, direction="z")
+        print(f"Tortuosity: {result.tortuosity:.4f}")
+```
+
+All computation happens inside the `oi.Session()` context manager, which
+manages the AMReX and MPI lifecycle.
+
+## Working with real images
+
+OpenImpala reads TIFF stacks, HDF5, and raw binary files:
+
+```python
+import openimpala as oi
+
+with oi.Session():
+    reader, img = oi.read_image("sample.tiff", threshold=128)
+    result = oi.tortuosity(img, phase=1, direction="z")
+```
+
+## Memory-safe workflows
+
+For large datasets, free the Python array before solving:
+
+```python
+import gc
+import numpy as np
+import openimpala as oi
+
+with oi.Session():
+    arr = np.load("large_volume.npy")
+    dataset = oi.core.VoxelImage.from_numpy(arr)
+
+    del arr
+    gc.collect()  # Free Python memory
+
+    result = oi.tortuosity(dataset, phase=1, direction="z")
+```
+
+## Next steps
+
+- {doc}`user-guide/concepts` — Understand tortuosity, effective diffusivity, and the mathematics
+- {doc}`user-guide/solvers` — Choose the right solver for your problem
+- {doc}`tutorials/index` — Interactive Colab notebooks
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 00000000..8c542460
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,76 @@
+OpenImpala Documentation
+========================
+
+**OpenImpala** is a high-performance framework for computing effective transport
+properties (diffusivity, conductivity, tortuosity) directly on 3D voxel images
+of porous microstructures.
+
+It solves steady-state transport equations on the voxel grid using finite
+differences, parallelised via MPI through the `AMReX <https://amrex-codes.github.io/amrex/>`_
+library, with `HYPRE <https://computing.llnl.gov/projects/hypre-scalable-linear-solvers-multigrid-methods>`_
+or AMReX MLMG for linear solves.
+
+.. code-block:: python
+
+   import numpy as np
+   import openimpala as oi
+
+   data = np.random.choice([0, 1], size=(64, 64, 64), dtype=np.int32)
+
+   with oi.Session():
+       result = oi.tortuosity(data, phase=1, direction="z")
+       print(f"Tortuosity: {result.tortuosity:.4f}")
+
+Install from PyPI
+-----------------
+
+.. code-block:: bash
+
+   # CPU version
+   pip install openimpala
+
+   # GPU version (NVIDIA CUDA)
+   pip install openimpala-cuda
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Getting Started
+
+   getting-started
+
+.. toctree::
+   :maxdepth: 2
+   :caption: User Guide
+
+   user-guide/concepts
+   user-guide/solvers
+   user-guide/input-files
+   user-guide/gpu
+   user-guide/hpc
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Tutorials
+
+   tutorials/index
+
+.. toctree::
+   :maxdepth: 2
+   :caption: API Reference
+
+   api/python
+   api/cpp
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Development
+
+   contributing
+   changelog
+
+
+Indices and tables
+------------------
+
+* :ref:`genindex`
+* :ref:`search`
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 00000000..07acf003
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,5 @@
+sphinx>=7.0
+furo
+breathe
+myst-parser
+sphinx-autodoc-typehints
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
new file mode 100644
index 00000000..1dfdc108
--- /dev/null
+++ b/docs/tutorials/index.md
@@ -0,0 +1,25 @@
+# Tutorials
+
+Interactive Jupyter notebooks that run directly in Google Colab — no local
+installation required.
+
+## Tutorial series
+
+| # | Topic | Colab |
+|---|-------|-------|
+| 01 | [Hello OpenImpala](https://github.com/BASE-Laboratory/OpenImpala/blob/master/tutorials/01_hello_openimpala.ipynb) — Installation, first tortuosity calculation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/master/tutorials/01_hello_openimpala.ipynb) |
+| 02 | [Digital Twin with PyBaMM](https://github.com/BASE-Laboratory/OpenImpala/blob/master/tutorials/02_digital_twin.ipynb) — Battery electrode parameterisation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/master/tutorials/02_digital_twin.ipynb) |
+| 03 | [REV & Uncertainty](https://github.com/BASE-Laboratory/OpenImpala/blob/master/tutorials/03_rev_and_uncertainty.ipynb) — Representative volume element analysis | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/master/tutorials/03_rev_and_uncertainty.ipynb) |
+| 04 | [Multi-Phase Transport](https://github.com/BASE-Laboratory/OpenImpala/blob/master/tutorials/04_multi_phase_transport.ipynb) — Heterogeneous media with multiple phases | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/master/tutorials/04_multi_phase_transport.ipynb) |
+| 05 | [Surrogate Modelling](https://github.com/BASE-Laboratory/OpenImpala/blob/master/tutorials/05_surrogate_modelling.ipynb) — ML surrogates for transport properties | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/master/tutorials/05_surrogate_modelling.ipynb) |
+| 06 | [Topology Optimisation](https://github.com/BASE-Laboratory/OpenImpala/blob/master/tutorials/06_topology_optimisation.ipynb) — Microstructure design | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/master/tutorials/06_topology_optimisation.ipynb) |
+| 07 | [HPC Scaling](https://github.com/BASE-Laboratory/OpenImpala/blob/master/tutorials/07_hpc_scaling.ipynb) — Performance analysis and scaling behaviour | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/master/tutorials/07_hpc_scaling.ipynb) |
+
+## Running locally
+
+All tutorials can also run locally if you have OpenImpala installed:
+
+```bash
+pip install openimpala jupyter matplotlib
+jupyter notebook tutorials/01_hello_openimpala.ipynb
+```
diff --git a/docs/user-guide/concepts.md b/docs/user-guide/concepts.md
new file mode 100644
index 00000000..9ad16478
--- /dev/null
+++ b/docs/user-guide/concepts.md
@@ -0,0 +1,77 @@
+# Concepts
+
+## What OpenImpala computes
+
+OpenImpala takes a **segmented 3D voxel image** (where each voxel is labelled
+with a phase ID) and computes **effective transport properties** by solving
+partial differential equations directly on the voxel grid.
+
+### Phase data
+
+Images are segmented into integer phase IDs stored in an AMReX `iMultiFab`.
+Typically:
+
+- **Phase 0** = pore / void
+- **Phase 1** = solid matrix
+
+This is configurable via the `phase` parameter. Multi-phase transport is
+supported: each phase can be assigned a different transport coefficient.
+
+### Volume fraction
+
+The simplest metric: the fraction of voxels belonging to a given phase.
+
+$$\varepsilon = \frac{N_{\text{phase}}}{N_{\text{total}}}$$
+
+### Percolation
+
+Before solving transport equations, OpenImpala checks whether the target phase
+forms a **connected path** from inlet to outlet using a GPU-accelerated
+flood-fill algorithm. If the phase does not percolate, transport is zero.
+
+### Tortuosity
+
+Tortuosity quantifies how much a winding pore structure impedes transport
+compared to a straight channel. OpenImpala solves the steady-state diffusion
+equation:
+
+$$\nabla \cdot (D \nabla \phi) = 0$$
+
+with Dirichlet boundary conditions at inlet ($\phi = 0$) and outlet
+($\phi = 1$), and zero-flux Neumann conditions on lateral faces.
+
+The effective diffusivity is computed from the resulting flux:
+
+$$D_{\text{eff}} = \frac{|\text{average flux}|}{\text{cross-section area} \times |\nabla\phi_{\text{imposed}}|}$$
+
+Tortuosity is then:
+
+$$\tau = \frac{\varepsilon_{\text{active}}}{D_{\text{eff}}}$$
+
+where $\varepsilon_{\text{active}}$ is the volume fraction of the percolating
+(connected) phase.
+
+For a uniform medium on an $N$-cell grid, the discrete solution gives
+$D_{\text{eff}} = N/(N-1)$, so $\tau = (N-1)/N$.
+
+### Effective diffusivity tensor
+
+For anisotropic microstructures, the full effective diffusivity tensor
+$\mathbf{D}_{\text{eff}}$ is computed by solving the **cell problem** from
+homogenisation theory:
+
+$$\nabla_\xi \cdot \left( D \nabla_\xi \chi_k \right) = -\nabla_\xi \cdot \left( D \hat{e}_k \right)$$
+
+for corrector functions $\chi_k$ in each direction $k \in \{x, y, z\}$, with
+periodic boundary conditions. The tensor components are:
+
+$$D_{\text{eff},ij} = \frac{1}{|Y|} \int_Y D(\boldsymbol{\xi}) \left( \delta_{ij} + \frac{\partial \chi_j}{\partial \xi_i} \right) \, d\boldsymbol{\xi}$$
+
+### Face coefficients
+
+Inter-cell diffusivities use the **harmonic mean** of adjacent cell values:
+
+$$D_{\text{face}} = \frac{2 D_L D_R}{D_L + D_R}$$
+
+This is physically correct for resistances in series and ensures that a solid
+cell ($D = 0$) adjacent to a pore cell correctly blocks transport.
diff --git a/docs/user-guide/gpu.md b/docs/user-guide/gpu.md
new file mode 100644
index 00000000..2935757f
--- /dev/null
+++ b/docs/user-guide/gpu.md
@@ -0,0 +1,51 @@
+# GPU Acceleration
+
+OpenImpala supports NVIDIA GPU acceleration via CUDA. Compute kernels, flood
+fills, and HYPRE solver loops are GPU-compatible (the MLMG solver is CPU-only).
+
+## Installation
+
+```bash
+pip install openimpala-cuda
+```
+
+The GPU wheel requires:
+- NVIDIA GPU with compute capability 7.0+ (Volta or newer)
+- CUDA runtime libraries (typically provided by the NVIDIA driver)
+
+The `openimpala-cuda` package is a drop-in replacement for `openimpala` — the
+Python API is identical.
+
+## Usage
+
+No code changes are needed. The same Python scripts work on both CPU and GPU:
+
+```python
+import openimpala as oi
+import numpy as np
+
+data = np.random.choice([0, 1], size=(256, 256, 256), dtype=np.int32)
+
+with oi.Session():
+    result = oi.tortuosity(data, phase=1, direction="z")
+```
+
+When a GPU is available, AMReX automatically offloads `ParallelFor` kernels
+and HYPRE solver operations to the device.
+
+## What runs on GPU
+
+- Phase data lookup and coefficient field construction
+- Flood-fill percolation checks (atomic scatter-add)
+- HYPRE matrix assembly and linear solves
+- Solution extraction and flux integration
+- Through-thickness profile computation
+- Connected components labelling
+
+## Performance considerations
+
+- GPU acceleration provides the most benefit for large domains (>128^3)
+- For small problems, CPU may be faster due to kernel launch overhead
+- The MLMG solver currently runs on CPU only; use HYPRE solvers for GPU
+- Data transfer between host and device is minimised by keeping AMReX
+  data structures on the device throughout the solve
diff --git a/docs/user-guide/hpc.md b/docs/user-guide/hpc.md
new file mode 100644
index 00000000..6b355d73
--- /dev/null
+++ b/docs/user-guide/hpc.md
@@ -0,0 +1,77 @@
+# HPC Usage
+
+OpenImpala is designed for distributed-memory parallelism via MPI, making it
+suitable for large-scale simulations on HPC clusters.
+
+## Running with MPI
+
+### Python
+
+```bash
+# Install OpenImpala and mpi4py
+pip install openimpala mpi4py
+
+# Run on 4 MPI ranks
+mpirun -np 4 python my_script.py
+```
+
+### C++ executable
+
+```bash
+mpirun -np 16 ./Diffusion3d inputs
+```
+
+### Apptainer on a cluster
+
+```bash
+mpirun -np 16 apptainer exec openimpala-v4.0.0.sif /opt/OpenImpala/build/Diffusion3d inputs
+```
+
+## SLURM batch script
+
+```bash
+#!/bin/bash
+#SBATCH --job-name=openimpala
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=32
+#SBATCH --time=02:00:00
+#SBATCH --partition=compute
+
+module load mpi
+
+srun apptainer exec openimpala-v4.0.0.sif \
+    /opt/OpenImpala/build/Diffusion3d inputs
+```
+
+## Domain decomposition
+
+AMReX decomposes the 3D domain into boxes distributed across MPI ranks. The
+`max_grid_size` parameter controls the maximum box size:
+
+```ini
+amr.max_grid_size = 64
+```
+
+- **Smaller values** create more boxes, improving load balance across many ranks
+- **Larger values** reduce inter-rank communication but may cause load imbalance
+- Choose a power of 2 that evenly divides your domain dimensions
+
+## Scaling guidelines
+
+| Domain size | Recommended ranks | max_grid_size |
+|-------------|-------------------|---------------|
+| 128^3 | 1-4 | 64 |
+| 256^3 | 4-16 | 64 |
+| 512^3 | 16-64 | 64 |
+| 1024^3 | 64-256 | 128 |
+
+## Memory estimates
+
+Approximate memory per voxel for a tortuosity solve:
+
+- Phase data: ~4 bytes/voxel (int32)
+- Solution field: ~8 bytes/voxel (float64)
+- HYPRE matrix: ~56 bytes/voxel (7-point stencil)
+- **Total: ~70 bytes/voxel**
+
+For a 512^3 domain on 64 ranks: 512^3 × 70 bytes / 64 ≈ 140 MiB per rank.
diff --git a/docs/user-guide/input-files.md b/docs/user-guide/input-files.md
new file mode 100644
index 00000000..08465d56
--- /dev/null
+++ b/docs/user-guide/input-files.md
@@ -0,0 +1,79 @@
+# Input Files
+
+When running OpenImpala from the command line (C++ executable), configuration
+is specified via AMReX `ParmParse` text files. The file is passed as the first
+argument:
+
+```bash
+./Diffusion3d inputs
+```
+
+## Example input file
+
+```ini
+# --- Image Input ---
+image.filename    = microstructure.tiff
+image.threshold   = 128
+
+# --- Solver Configuration ---
+tortuosity.direction      = 2          # 0=X, 1=Y, 2=Z
+tortuosity.phase_id       = 0          # Phase to solve for
+tortuosity.solver_type    = PCG        # PCG, FlexGMRES, GMRES, BiCGSTAB, SMG, PFMG
+
+# --- HYPRE Solver Parameters ---
+hypre.eps      = 1.0e-9                # Convergence tolerance
+hypre.maxiter  = 200                   # Maximum iterations
+
+# --- AMReX Grid Configuration ---
+amr.max_grid_size = 64                 # Box decomposition size
+
+# --- Output ---
+results.path = ./results               # Output directory
+tortuosity.write_plotfile = false      # Write AMReX plotfile of solution
+tortuosity.verbose = 1                 # 0=silent, 1=basic, 2+=detailed
+```
+
+## Key parameters
+
+### Image input
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `image.filename` | string | Path to 3D image (TIFF, HDF5, RAW, DAT) |
+| `image.threshold` | float | Binarisation threshold value |
+| `image.hdf5_dataset` | string | HDF5 dataset path (default: `/data`) |
+
+### Solver
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `tortuosity.direction` | int | 0 | Flow direction: 0=X, 1=Y, 2=Z |
+| `tortuosity.phase_id` | int | 0 | Phase ID of the conducting phase |
+| `tortuosity.solver_type` | string | PCG | HYPRE solver algorithm |
+| `tortuosity.vlo` | float | 0.0 | Dirichlet boundary value at the inlet |
+| `tortuosity.vhi` | float | 1.0 | Dirichlet boundary value at the outlet |
+
+### Multi-phase transport
+
+```ini
+tortuosity.active_phases      = 0 2       # Phase IDs with non-zero D
+tortuosity.phase_diffusivities = 1.0 0.5  # Corresponding D values
+```
+
+### Grid decomposition
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `amr.max_grid_size` | int | 32 | Maximum box size for domain decomposition |
+
+Smaller values create more boxes (better MPI load balance); larger values
+reduce communication overhead. Powers of 2 that divide the domain dimensions
+evenly are recommended.
+
+## Output files
+
+| File | Description |
+|------|-------------|
+| `results.json` | Structured JSON with all computed properties |
+| `results.txt` | Human-readable summary |
+| `plt_*/` | AMReX plotfile (if `tortuosity.write_plotfile = true`) |
diff --git a/docs/user-guide/solvers.md b/docs/user-guide/solvers.md
new file mode 100644
index 00000000..b8f7825a
--- /dev/null
+++ b/docs/user-guide/solvers.md
@@ -0,0 +1,71 @@
+# Solvers
+
+OpenImpala provides two solver backends for computing tortuosity, plus a
+legacy solver retained for comparison.
+
+## HYPRE solvers (default)
+
+The primary backend uses [HYPRE](https://computing.llnl.gov/projects/hypre-scalable-linear-solvers-multigrid-methods)
+structured-grid solvers. Available algorithms:
+
+| Solver | Type | Best for | Python name |
+|--------|------|----------|-------------|
+| **PCG** | Krylov (CG) | Single-phase diffusion (SPD systems) | `"pcg"` or `"auto"` |
+| FlexGMRES | Krylov | Multi-phase, non-symmetric problems | `"flexgmres"` |
+| GMRES | Krylov | General sparse systems | `"gmres"` |
+| BiCGSTAB | Krylov | Non-symmetric, when GMRES stalls | `"bicgstab"` |
+| SMG | Multigrid | Small grids, direct-like convergence | `"smg"` |
+| PFMG | Multigrid | Large grids, low memory | `"pfmg"` |
+
+**Default:** `"auto"` selects PCG, which is optimal for the single-phase
+steady-state diffusion problem (the Laplacian with harmonic-mean face
+coefficients is symmetric positive-definite).
+
+```python
+# Use the default (PCG)
+result = oi.tortuosity(data, phase=1, direction="z")
+
+# Explicitly choose a solver
+result = oi.tortuosity(data, phase=1, direction="z", solver="flexgmres")
+```
+
+## AMReX MLMG solver
+
+The matrix-free geometric multigrid solver uses AMReX's native
+`MLABecLaplacian` operator. Advantages:
+
+- **No matrix assembly** — the operator is applied matrix-free
+- **Lower memory** — approximately 3x less than HYPRE's `StructMatrix`
+- **Faster setup** — no algebraic multigrid (AMG) construction
+
+Best for small-to-medium grids on shared-memory systems.
+
+```python
+result = oi.tortuosity(data, phase=1, direction="z", solver="mlmg")
+```
+
+## When to use which
+
+| Scenario | Recommended solver |
+|----------|--------------------|
+| Quick desktop analysis (<256^3) | `"mlmg"` |
+| Single-phase, any size | `"auto"` (PCG) |
+| Multi-phase with varying D | `"flexgmres"` |
+| Large distributed MPI runs | `"pcg"` or `"pfmg"` |
+| Debugging / comparison | `"smg"` (most robust) |
+
+## Effective diffusivity tensor
+
+The `EffectiveDiffusivityHypre` solver uses the same HYPRE backends but solves
+the cell problem with periodic boundary conditions. It is available through the
+C++ API and the command-line interface, but is not yet exposed in the high-level
+Python facade.
+
+## Solver parameters
+
+When using the C++ interface or input files, solver behaviour is controlled via:
+
+```ini
+hypre.eps = 1.0e-9       # Convergence tolerance
+hypre.maxiter = 200      # Maximum iterations
+```
diff --git a/pyproject.toml b/pyproject.toml
index 12c734ca..20441548 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,13 @@ test = [
     "pytest>=7",
     "numpy",
 ]
+docs = [
+    "sphinx>=7.0",
+    "furo",
+    "breathe",
+    "myst-parser",
+    "sphinx-autodoc-typehints",
+]
 all = [
     "mpi4py",
     "tqdm",