nihermann · nihermann · Jul 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jul 3, 2026
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,58 @@
+name: Release
+
+on:
+  release:
+    types: [published]  # run when a GitHub release is published
+  workflow_dispatch:  # manual runs only build; the publish job is gated on release events
+
+permissions:
+  contents: read  # default token scope for every job unless a job overrides it
+
+jobs:
+  build:
+    name: Build package
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # full history and tags; NOTE(review): presumably needed for setuptools-scm versioning - confirm
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Build artifacts
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install build twine
+          python -m build
+          python -m twine check dist/*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: python-package  # must match the name used by the download step in the publish job
+          path: dist/*
+
+  publish:
+    name: Publish to PyPI
+    if: github.event_name == 'release'  # skip publishing on manual (workflow_dispatch) runs
+    needs: build
+    runs-on: ubuntu-latest
+    environment: pypi  # NOTE(review): presumably the environment registered with the PyPI trusted publisher - confirm
+    permissions:
+      contents: read
+      id-token: write  # OIDC token; NOTE(review): presumably for PyPI trusted publishing, as no token input is passed - confirm
+
+    steps:
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package
+          path: dist  # NOTE(review): presumably the directory the publish action uploads from by default - confirm
+
+      - name: Publish package
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.gitignore b/.gitignore
@@ -1,9 +1,16 @@
 .idea
 .vscode
 __pycache__
-.DS_Store
-.mypy_cache/
-dino_models
 *.egg-info/
-dist
-venv
+*.pyc
+*.pyd
+*.so
+*.whl
+build/
+dist/
+.DS_Store
+
+.pytest_cache/
+# LPIPS weight files. gitignore patterns use "/" ("\" is an escape character); "**/" also covers the src/ layout.
+**/puzzle_sim/lpips_models/alex.pth
+**/puzzle_sim/lpips_models/vgg.pth
diff --git a/README.md b/README.md
@@ -30,17 +30,62 @@ This repository contains the implementation of the cross-reference metric Puzzle
 - (29-11-2024) Official code release
 
 
-### Requirements
-If you simply want to use the metric use:
+### Installation
+
+PuzzleSim has one public entry point:
+
+```python
+from puzzle_sim import PuzzleSim
+```
+
+There is no separate CUDA package.
+
+#### PyTorch implementation
+
+Install from PyPI:
+
 ```shell
 pip install puzzle_sim
 ```
 
-If you want to extend it please install it locally as a package. The package requires Python 3.8 or higher. If you wish to use dinov3 backbones you must have Python 3.10 or higher and `transformers>=4.56`:
+This works on CPU-only machines and on machines with a CUDA-enabled PyTorch installation. If your tensors are on CUDA, PuzzleSim uses PyTorch CUDA operations automatically.
+
+```python
+from puzzle_sim import PuzzleSim
+
+priors = priors.cuda()
+test_image = test_image.cuda()
+puzzle = PuzzleSim(reference=priors, net_type="squeeze")
+similarity_map = puzzle(test_image)
+```
+
+#### Faster CUDA implementation
+
+For the faster CUDA implementation, build PuzzleSim from source. You need:
+- a CUDA GPU,
+- a CUDA-enabled PyTorch installation,
+- the CUDA toolkit with `nvcc`,
+- a compatible C++ compiler.
+
+Install the CUDA build of PyTorch first, using the command for your platform from the
+[PyTorch installation guide](https://pytorch.org/get-started/locally/). Then build PuzzleSim:
+
 ```shell
-pip install -e .
+git clone https://github.com/nihermann/PuzzleSim.git
+cd PuzzleSim
+pip install --upgrade pip setuptools wheel setuptools-scm
+PUZZLE_SIM_BUILD_CUDA=1 pip install --no-build-isolation -v .
 ```
 
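+`--no-build-isolation` is required because PyTorch's extension build uses the already-installed `torch` package. The `PUZZLE_SIM_BUILD_CUDA=1 pip install ...` form is POSIX-shell syntax; on Windows, set the variable first (`set PUZZLE_SIM_BUILD_CUDA=1` in cmd, or `$env:PUZZLE_SIM_BUILD_CUDA = "1"` in PowerShell) and then run the `pip install` command. After installation, verify the compiled extension is available: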
+
+```python
+import puzzle_sim
+print(puzzle_sim.get_cuda_version_info())
+```
+
+The PyPI wheel does not currently include prebuilt CUDA kernels. If you do not need the faster compiled kernels, use `pip install puzzle_sim`.
+
 
 ### Usage
 You can use the metric in your own code as follows:
@@ -64,7 +109,7 @@ puzzle = PuzzleSim(reference=priors, net_type='convnext_tiny')
 
 similarity_map = puzzle(test_image, layers=range(5), weights=None, reduction='mean')  # (H, W) similarity map in [0, 1]
 ```
-> If your GPU runs out of memory, try reducing the `stride` parameter in the forward call, this will reduce memory consumption. On the other hand, with small image dimensions the naive implementation might be faster although requiring much more memory (set `mem_save=False`).
+> If your GPU runs out of memory on the PyTorch fallback path, try reducing the `stride` parameter in the forward call; this will reduce memory consumption. On the integrated CUDA path, the packed matmul backend chunks the score matrix automatically.
 
 ### Demo
 Please find the demo in `demo.ipynb` to see how to run the metric on some example sets. In order to run the demo, you need to pull the data from another repository. Do this by either cloning the repository using

diff --git a/demo.ipynb b/demo.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,7 +16,8 @@ maintainers = [
 description = "Implementation of PuzzleSim, a cross-refence image similarity metric designed for artifact detection in novel view synthesis methods."
 readme = "README.md"
 requires-python = ">=3.8"
-license = { file = "LICENSE.md" }
+license = "MIT"
+license-files = ["LICENSE.md"]
 keywords = ["puzzle similarity", "similarity", "no reference", "cross reference", "image metric"]
 classifiers = [
     "Development Status :: 5 - Production/Stable",
@@ -42,6 +43,9 @@ Issues = "https://github.com/nihermann/PuzzleSim/issues"
 where = ["src"]
 include = ["puzzle_sim*"]
 
+[tool.setuptools.package-data]
+puzzle_sim = ["py.typed", "cuda_ext/*.cpp", "cuda_ext/*.cu"]
+
 [tool.setuptools.dynamic]
 dependencies = { file = ["requirements.txt"] }
 optional-dependencies = { dev = { file = ["requirements-dev.txt"] } }

diff --git a/setup.py b/setup.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+import os
+from typing import Any
+
+from setuptools import setup
+
+
+def _truthy_env(name: str) -> bool:
+    """Report whether environment variable ``name`` holds an affirmative value.
+
+    The value is stripped of surrounding whitespace and compared
+    case-insensitively against ``1``, ``true``, ``yes`` and ``on``. An unset
+    variable is treated like an empty string and therefore counts as false.
+    """
+    raw_value = os.environ.get(name, "")
+    normalized = raw_value.strip().lower()
+    return normalized in {"1", "true", "yes", "on"}
+
+
+def _detected_arch_list() -> list[str]:
+    """Return the CUDA compute capabilities the extension should be built for.
+
+    Resolution order:
+
+    1. A non-empty ``TORCH_CUDA_ARCH_LIST`` environment variable, split into
+       its entries. Both ``;`` and spaces are treated as separators, matching
+       what PyTorch's extension builder accepts.
+    2. The capabilities of *all* CUDA devices visible to ``torch``, so a
+       machine with mixed GPUs gets kernels for each of them rather than only
+       for the current device.
+    3. A fixed fallback list when no CUDA device is available (for example
+       when building on a machine without a GPU).
+
+    Returns:
+        Architecture strings such as ``"8.6"``, de-duplicated and in ascending
+        order when they are detected from devices.
+
+    Raises:
+        ImportError: if ``torch`` is not importable and the environment
+            variable is unset or empty.
+    """
+    env = os.environ.get("TORCH_CUDA_ARCH_LIST")
+    if env:
+        # Normalise spaces to ";" first so "7.5 8.0" yields two entries.
+        return [item.strip() for item in env.replace(" ", ";").split(";") if item.strip()]
+
+    # Imported lazily: torch is only needed when the variable is not set.
+    import torch
+
+    if torch.cuda.is_available():
+        # A set removes duplicates when several GPUs share one capability.
+        capabilities = {
+            torch.cuda.get_device_capability(index)
+            for index in range(torch.cuda.device_count())
+        }
+        return [f"{major}.{minor}" for major, minor in sorted(capabilities)]
+    return ["7.0", "7.5", "8.0", "8.6", "8.9", "9.0"]
+
+
+def _cuda_build_kwargs() -> dict[str, Any]:
+    """Assemble the extra ``setup()`` keyword arguments for the optional CUDA build.
+
+    Returns an empty dict unless the ``PUZZLE_SIM_BUILD_CUDA`` environment
+    variable is truthy. Otherwise the result declares the
+    ``puzzle_sim._cuda_ext`` extension, registers PyTorch's ``BuildExtension``
+    as the ``build_ext`` command and sets ``zip_safe`` to ``False``.
+
+    Side effect: when ``TORCH_CUDA_ARCH_LIST`` is not already present in the
+    environment, it is set from ``_detected_arch_list()``.
+
+    Raises:
+        RuntimeError: if the CUDA build is requested but ``torch`` cannot be
+            imported in the build environment.
+    """
+    cuda_requested = _truthy_env("PUZZLE_SIM_BUILD_CUDA")
+    if not cuda_requested:
+        return {}
+
+    try:
+        import torch  # noqa: F401
+        from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+    except ImportError as exc:  # pragma: no cover - build-environment dependent
+        message = (
+            "PUZZLE_SIM_BUILD_CUDA=1 requires torch to be installed in the build environment. "
+            "Use pip build isolation off for CUDA wheel builds."
+        )
+        raise RuntimeError(message) from exc
+
+    # setdefault keeps an architecture list the caller has already exported.
+    arch_spec = ";".join(_detected_arch_list())
+    os.environ.setdefault("TORCH_CUDA_ARCH_LIST", arch_spec)
+
+    # The host compiler takes "/O2" on Windows (os.name == "nt") and "-O3" elsewhere.
+    host_flags = ["-O3"] if os.name != "nt" else ["/O2"]
+    device_flags = [
+        "-O3",
+        "--use_fast_math",
+        "--expt-relaxed-constexpr",
+        "-allow-unsupported-compiler",
+    ]
+    extension = CUDAExtension(
+        name="puzzle_sim._cuda_ext",
+        sources=[
+            "src/puzzle_sim/cuda_ext/puzzle_sim_cuda.cpp",
+            "src/puzzle_sim/cuda_ext/kernels.cu",
+        ],
+        extra_compile_args={"cxx": host_flags, "nvcc": device_flags},
+    )
+
+    build_kwargs: dict[str, Any] = {
+        "ext_modules": [extension],
+        "cmdclass": {"build_ext": BuildExtension},
+        "zip_safe": False,
+    }
+    return build_kwargs
+
+
+# Only the optional CUDA-extension arguments are passed here; they are empty
+# unless PUZZLE_SIM_BUILD_CUDA is truthy.
+setup(**_cuda_build_kwargs())