From 0a675a8a4bb3f0bf4c0699d8835a04229f0a767c Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Mon, 4 May 2026 13:31:00 -0500 Subject: [PATCH 01/15] ENH: Add v4 remote-module ingestion (two-phase, per-commit hooks) Splits the v3 single-driver flow into two independent phases so the upstream-archival publish step never runs on a working tree that filter-repo has rewritten: - Phase A (ingest-module-v4.sh): local, throwaway clone; rewrites history into Modules//; never touches the upstream remote. - Phase B (archive-remote-module.sh): fresh shallow clone; pushes one removal commit + promotes MIGRATION_README.md to README.md; flips archived=true. Gated on the Phase A PR being merged. The shared sanitize-history.py walks the rewritten branch and applies the same hooks ITK's .pre-commit-config.yaml enforces: - clang-format on every C/C++ blob (content sniff) - black on every Python blob (content sniff) - gersemi on every CMake blob using ITK's .gersemi.config - trailing-whitespace, end-of-file-fixer, mixed-line-ending on every text-classified blob - VTK / SVG / hash-content sidecar files are skipped intact - Every commit subject without an ITK prefix gets one heuristically auto-prepended; decisions are logged for operator audit filter-repo's blob_callback is filename-blind by design, so excludes are approximated via content sniffing. IOMeshSTL whitelist is the first concrete example; smoke-tested via --dry-run successfully (143->89 commits, 44->24 merges preserved, 108/149 blobs reformatted). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../INGESTION_STRATEGY_v4.md | 257 +++++++++ .../archive-remote-module.sh | 315 ++++++++++ .../RemoteModuleIngest/ingest-module-v4.sh | 372 ++++++++++++ .../RemoteModuleIngest/sanitize-history.py | 539 ++++++++++++++++++ .../whitelists/IOMeshSTL.list | 25 + 5 files changed, 1508 insertions(+) create mode 100644 Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md create mode 100755 Utilities/Maintenance/RemoteModuleIngest/archive-remote-module.sh create mode 100755 Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh create mode 100755 Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py create mode 100644 Utilities/Maintenance/RemoteModuleIngest/whitelists/IOMeshSTL.list diff --git a/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md new file mode 100644 index 00000000000..bf53f8eeb0a --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md @@ -0,0 +1,257 @@ +# Remote Module Ingestion Strategy — v4 (two-phase split) + +This is the next iteration of the ingestion process. v3 (see +`INGESTION_STRATEGY.md`) folded the upstream-archival step into the +ingestion driver, which made the ingest script harder to re-run and +coupled an irreversible upstream operation (publishing a deletion +commit + flipping `archived=true`) to a reversible local one (rewriting +history into ITK). + +**v4 splits these into two independent phases**, with separate scripts +and separate operator gates between them. + +## TL;DR + +| Phase | Script | Local-clone fate | Upstream side-effects | +|---|---|---|---| +| **A — Ingestion** | `ingest-module-v4.sh` | **Throwaway** — destroy after PR merges | None (no pushes to upstream remote) | +| **B — Upstream archival** | `archive-remote-module.sh` | **Fresh clone** — built from scratch | Pushes one removal commit + promotes `MIGRATION_README.md` to `README.md` + flips `archived=true` | + +The phases share **only one persistent artifact** between them: the +**whitelist** that defines what files migrated to ITK vs. what files +remain in the upstream archive. Every other piece of state is +reproducible. + +## Why two phases + +| v3 problem | v4 solution | +|---|---| +| Ingest script orchestrated upstream archival as a bonus step → re-running the ingest after a CI failure risked re-touching upstream | Phase A never touches the upstream remote at all; failures are 100% local | +| Per-commit pre-commit + ghostflow validity was best-effort; some historical commits failed style hooks during reviewer testing | Phase A applies clang-format + black + commit-prefix sanitization to **every** commit, so every commit independently passes ITK's gates | +| Upstream archival was a manual checklist (per `feedback_ingest_archive_readme.md`) → easy to forget steps, ordering errors | Phase B is a single script with deterministic ordering and post-conditions verifiable via `gh api .../readme` | +| `git blame` on imported history showed mixed formatting from various upstream eras → noise on subsequent maintenance work | Phase A formats every blob across history with the destination ITK's `.clang-format` so blame is uniform | + +## Phase A — Ingestion (local, throwaway) + +### Operator command + +```bash +ingest-module-v4.sh [--dry-run] [--keep-tempdir] +``` + +Examples: + +```bash +ingest-module-v4.sh IOMeshSTL IO # Modules/IO/MeshSTL/ +ingest-module-v4.sh Cuberille Filtering # Modules/Filtering/Cuberille/ +``` + +### What Phase A does + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ 1. Pre-flight │ +│ - git-filter-repo, clang-format, black available │ +│ - clean ITK working tree on a feature branch │ +│ - whitelist file readable │ +│ - destination directory does not collide │ +├─────────────────────────────────────────────────────────────────┤ +│ 2. Mirror-clone upstream into ${TMPDIR}/-mirror │ +│ - throwaway location; deleted on success unless --keep- │ +├─────────────────────────────────────────────────────────────────┤ +│ 3. filter-repo: paths-from-file , --tag-rename '' │ +│ - keep only whitelisted files across full history │ +│ - non-whitelisted files vanish from every commit │ +├─────────────────────────────────────────────────────────────────┤ +│ 4. filter-repo: subdirectory move into Modules// │ +├─────────────────────────────────────────────────────────────────┤ +│ 5. sanitize-history.py: │ +│ - For every commit: │ +│ a. Walk file_changes, rewrite each .cxx/.h/.hxx/... blob │ +│ through `clang-format -style=file` (ITK's .clang-format)│ +│ b. Walk file_changes, rewrite each .py blob through `black`│ +│ c. Examine commit subject; if no BUG:/COMP:/DOC:/ENH:/ │ +│ PERF:/STYLE: prefix → auto-prepend a heuristic prefix │ +│ d. Log every prefix decision to │ +│ ${LOG_DIR}/commit-prefix-log.txt │ +├─────────────────────────────────────────────────────────────────┤ +│ 6. Topology verification │ +│ - require ≥1 merge in upstream/main..HEAD if upstream had any│ +│ - reject linearization (per feedback_ingest_merge_topology) │ +├─────────────────────────────────────────────────────────────────┤ +│ 7. Mode-A merge into ITK feature branch │ +│ - --no-ff --allow-unrelated-histories │ +│ - merge commit subject: "ENH: Ingest remote module" │ +├─────────────────────────────────────────────────────────────────┤ +│ 8. Verification │ +│ - pre-commit run --all-files exit 0 │ +│ - every commit on the ingest tail has a valid prefix │ +│ - git blame on a sampled file walks back into upstream │ +└─────────────────────────────────────────────────────────────────┘ +``` + +After Phase A succeeds, the operator opens a draft PR (per +`pr-no-unsolicited` + `pr-always-draft`). The temp dir is destroyed +unless `--keep-tempdir` was passed. **Phase A makes no commits or +pushes to the upstream remote — full stop.** + +## Phase B — Upstream archival (clean clone, irreversible publish) + +Run **only after the Phase A PR has merged into ITK `main`.** This is +the gate: until ITK has accepted the migration, the upstream archive is +not authorized. + +### Operator command + +```bash +archive-remote-module.sh [--no-archive-flip] [--dry-run] +``` + +### What Phase B does + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ 1. Pre-flight │ +│ - The Phase A ingest PR has merged on ITK main │ +│ - git, gh CLI authenticated, network reachable │ +│ - whitelist file (same one used in Phase A) accessible │ +├─────────────────────────────────────────────────────────────────┤ +│ 2. Clean shallow clone of upstream │ +│ git clone --depth 50 git@github.com:/.git ${TMP} │ +├─────────────────────────────────────────────────────────────────┤ +│ 3. Build the removal commit on a fresh branch │ +│ - git rm -rf each whitelisted path │ +│ - leave non-whitelisted (papers, Old/, examples/, etc.) │ +│ - one commit subject: "MIGRATE: Remove files migrated to ITK │ +│ " │ +├─────────────────────────────────────────────────────────────────┤ +│ 4. Promote MIGRATION_README.md to README.md │ +│ - per feedback_ingest_archive_readme.md │ +│ - rename old README.rst → info.rst (history preserved) │ +│ - rename MIGRATION_README.md → README.md │ +│ - one additional commit: │ +│ "DOC: Promote migration notice to README.md" │ +├─────────────────────────────────────────────────────────────────┤ +│ 5. Push to upstream main │ +│ git push origin HEAD:main │ +├─────────────────────────────────────────────────────────────────┤ +│ 6. Verify GitHub-rendered README is the migration notice │ +│ gh api repos///readme --jq .content | base64 -d │ +│ | head -10 must contain "Migrated to ITK main" or equivalent │ +├─────────────────────────────────────────────────────────────────┤ +│ 7. Flip archived=true (skip if --no-archive-flip) │ +│ gh api -X PATCH repos// -F archived=true │ +├─────────────────────────────────────────────────────────────────┤ +│ 8. Cleanup │ +│ - delete the throwaway clone │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Why Phase B uses a fresh clone + +The Phase A throwaway clone has been mutated by `filter-repo` and is +**not safe to push back** to the upstream remote — its commit graph and +object pool are no longer 1:1 with the upstream. A fresh clone is the +only way to safely publish a deletion commit on top of upstream's actual +HEAD without inadvertently force-pushing rewritten history to the +upstream archive (which would destroy the original development history +that the archive is supposed to preserve). + +## Shared artifact: the whitelist + +Both phases consume the same whitelist. It lives at: + +``` +Utilities/Maintenance/RemoteModuleIngest/whitelists/.list +``` + +Format: one path/glob per line, relative to the upstream repo root, +shell-style globs. Lines starting with `#` are comments. Example for +`IOMeshSTL`: + +``` +# Public headers +include/** + +# Test sources + fixtures +test/** + +# CMake / module metadata +CMakeLists.txt +itk-module.cmake + +# Wrapping +wrapping/** + +# License (always) +LICENSE +``` + +Phase A uses this with `--paths-from-file`; Phase B inverts it (delete +everything on the whitelist; keep everything else, plus +`MIGRATION_README.md` if present). + +## Implementation choices (from 2026-05-04 design discussion) + +| Question | Decision | +|---|---| +| Where does the upstream "removal" commit live? | **Phase B publishes it to upstream `main` directly** (replaces the manual archival PR pattern from `feedback_ingest_archive_readme.md`) | +| Auto-prefix vs. report-and-fail for commit subjects? | **Heuristic auto-prefix with sidecar log** — fast, operator can spot-check the log before merge | +| Format-history scope? | **All commits** — uniform `git blame`, one-time wall-time cost is acceptable | +| Python formatter? | **`black`** on every `.py` blob across all history (in addition to clang-format on C++) | +| Where do v3 scripts go? | Kept in tree, renamed `*_v3.{sh,py}` until v4 has been used for two successful ingests, then retired to `Utilities/Maintenance/RemoteModuleIngest/retired/v3/` | + +## Operator gate between Phase A and Phase B + +``` +Phase A: ingest-module-v4.sh IOMeshSTL IO + → draft PR opened + → reviewer feedback, fixups, force-pushes (Phase A re-runs OK) + → PR marked ready for review + → MAINTAINER MERGES THE PR + ──────────────────────────────────────── + → Phase B: archive-remote-module.sh IOMeshSTL + → upstream main now shows MIGRATION_README.md as repo README + → upstream archived=true +``` + +This gate is the central correctness property of v4: **upstream is +never touched until ITK has accepted the migration.** + +## Heuristic prefix mapping + +When `sanitize-history.py` encounters a commit subject without a valid +prefix, it auto-prepends one based on the table below. Every decision +is logged with ` ` so the +operator can audit before merge. + +| Subject contains... | Prefix | +|---|---| +| `fix`, `bug`, `crash`, `segfault`, `leak`, `regression` | `BUG:` | +| `cmake`, `compil`, `warning`, `build`, ` ci ` / `ci:`, `clang-format` config | `COMP:` | +| `doc`, `readme`, `comment`, `license`, `header text` | `DOC:` | +| `style`, `whitespace`, `format`, `rename`, `lint`, `clean up` | `STYLE:` | +| `perf`, `optimi`, `speed`, `faster`, `efficien` | `PERF:` | +| anything else | `ENH:` (default) | + +These rules are intentionally **conservative** — when in doubt, default +to `ENH:`. Misclassifications get caught by the operator reading the +sidecar log; the cost of `ENH: STYLE-cleanup-of-headers` slipping past +review is much lower than the cost of `STYLE: Add new IO format +support` mis-prefixed. + +## What this strategy explicitly does NOT change from v3 + +- The Mode-A (`--no-ff --allow-unrelated-histories`) merge into ITK +- Merge-topology preservation requirement (per + `feedback_ingest_merge_topology.md`) +- The Mode-A subject convention for the merge commit (`ENH: Ingest …`) +- The whitelist as the source of truth for what migrates +- The directory layout (`Modules///`) + +## Provenance + +- 2026-05-04 — Initial v4 design (Hans Johnson + Claude Code session) in + response to coupled-archival pain in v3 ingests (LabelErodeDilate, + GLI, MGHIO, FastBilateral, MeshNoise, Montage) +- Smoke-tested on `IOMeshSTL` before being applied to any other module diff --git a/Utilities/Maintenance/RemoteModuleIngest/archive-remote-module.sh b/Utilities/Maintenance/RemoteModuleIngest/archive-remote-module.sh new file mode 100755 index 00000000000..7055eb9de1a --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/archive-remote-module.sh @@ -0,0 +1,315 @@ +#!/usr/bin/env bash +# archive-remote-module.sh — Phase B driver for v4 ingestion +# +# Phase B is the IRREVERSIBLE, UPSTREAM-PUBLISH half. It runs ONLY +# AFTER the Phase A ingest PR has merged into ITK main. Phase B: +# +# 1. Clones the upstream remote module fresh (NEVER reuses Phase A's +# filter-repo'd clone — that history is rewritten and unsafe to +# push back). +# 2. Removes the whitelisted files (the ones now living in ITK). +# 3. Promotes MIGRATION_README.md to README.md so the GitHub-rendered +# landing page shows the migration notice (per +# feedback_ingest_archive_readme.md). +# 4. Pushes the resulting two commits to upstream main. +# 5. Verifies via gh api that the rendered README contains the +# migration notice. +# 6. Flips archived=true on the GitHub repo. +# +# See INGESTION_STRATEGY_v4.md for the full design. +# +# Usage: +# archive-remote-module.sh [options] +# +# Options: +# --upstream-url URL Override the GIT_REPOSITORY parsed from +# Modules/Remote/.remote.cmake. +# --whitelist FILE Override the default whitelist location +# (whitelists/.list, this directory). +# --itk-pr-number N The merged Phase A PR number, included in +# the upstream removal-commit message. +# --no-archive-flip Push the deletion + README commits, but do +# NOT flip archived=true. Use when the +# upstream still needs hand-tweaks. +# --dry-run Build the commits in a tempdir and show +# what would be pushed; do not push. +# --keep-tempdir Don't delete the throwaway clone. +# -h, --help Show this help. +# +# Exit codes: +# 0 — success +# 1 — argument or environment failure +# 2 — clone, commit, or push failure +# 3 — README verification failed (rendered README is not the migration notice) + +set -euo pipefail + +# -------------------------------------------------------------------- +# Helpers +# -------------------------------------------------------------------- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +info() { printf '==> %s\n' "$*"; } +warn() { printf 'WARN: %s\n' "$*" >&2; } +die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; } + +show_help() { + sed -n '2,/^$/{ s/^# \?//; p }' "$0" + exit 0 +} + +# -------------------------------------------------------------------- +# Argument parsing +# -------------------------------------------------------------------- +MODULE="" +UPSTREAM_URL="" +WHITELIST="" +ITK_PR_NUMBER="" +NO_ARCHIVE_FLIP=false +DRY_RUN=false +KEEP_TEMPDIR=false + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) show_help ;; + --upstream-url) UPSTREAM_URL="$2"; shift 2 ;; + --whitelist) WHITELIST="$2"; shift 2 ;; + --itk-pr-number) ITK_PR_NUMBER="$2"; shift 2 ;; + --no-archive-flip) NO_ARCHIVE_FLIP=true; shift ;; + --dry-run) DRY_RUN=true; shift ;; + --keep-tempdir) KEEP_TEMPDIR=true; shift ;; + -*) die "Unknown option: $1" ;; + *) + [[ -z "$MODULE" ]] && MODULE="$1" || die "Unexpected positional argument: $1" + shift + ;; + esac +done + +[[ -n "$MODULE" ]] || die "Module name required (e.g., IOMeshSTL)" + +# -------------------------------------------------------------------- +# Preflight +# -------------------------------------------------------------------- +command -v gh >/dev/null || die "gh CLI required" +command -v git >/dev/null || die "git required" + +# Resolve whitelist +if [[ -z "$WHITELIST" ]]; then + WHITELIST="$SCRIPT_DIR/whitelists/$MODULE.list" +fi +[[ -r "$WHITELIST" ]] || die "Whitelist not readable: $WHITELIST" + +# Try to infer upstream URL from the now-removed remote.cmake by looking +# back in git history of an ITK checkout, if available. +if [[ -z "$UPSTREAM_URL" ]]; then + ITK_SRC="$(git rev-parse --show-toplevel 2>/dev/null || true)" + if [[ -n "$ITK_SRC" ]]; then + REMOTE_FILE="$ITK_SRC/Modules/Remote/$MODULE.remote.cmake" + if [[ -f "$REMOTE_FILE" ]]; then + UPSTREAM_URL=$(awk '/GIT_REPOSITORY/ { print $2 }' "$REMOTE_FILE" | head -1) + else + # remote.cmake already deleted by Phase A — try the file's last + # version before deletion. + UPSTREAM_URL=$( + git -C "$ITK_SRC" log --all --pretty=format: --diff-filter=D \ + -p -- "Modules/Remote/$MODULE.remote.cmake" 2>/dev/null \ + | awk '/^-\s*GIT_REPOSITORY/ { print $2 }' \ + | head -1 + ) || true + fi + fi +fi +[[ -n "$UPSTREAM_URL" ]] \ + || die "Could not infer upstream URL; pass --upstream-url explicitly." + +# Parse owner/repo for gh-api calls +OWNER_REPO=$(echo "$UPSTREAM_URL" \ + | sed -E 's#^(git@github.com:|https://github.com/)##; s#\.git$##') +[[ "$OWNER_REPO" =~ ^[^/]+/[^/]+$ ]] \ + || die "Could not parse owner/repo from $UPSTREAM_URL (got: $OWNER_REPO)" + +info "Phase B — Upstream archival (fresh clone, irreversible publish)" +info " Module: $MODULE" +info " Upstream URL: $UPSTREAM_URL" +info " Owner/repo: $OWNER_REPO" +info " Whitelist: $WHITELIST" +[[ -n "$ITK_PR_NUMBER" ]] && info " ITK PR ref: #$ITK_PR_NUMBER" +$DRY_RUN && info " Mode: --dry-run (no push, no archive flip)" +$NO_ARCHIVE_FLIP && info " --no-archive-flip: will push but not archive" + +# -------------------------------------------------------------------- +# Operator gate: confirm the Phase A PR has merged. +# -------------------------------------------------------------------- +if [[ -n "$ITK_PR_NUMBER" ]] && ! $DRY_RUN; then + PR_STATE=$(gh pr view "$ITK_PR_NUMBER" \ + --repo InsightSoftwareConsortium/ITK \ + --json state,mergedAt --jq '"\(.state) merged=\(.mergedAt // "no")"' \ + 2>/dev/null) || PR_STATE="(could not query)" + info " ITK PR #$ITK_PR_NUMBER state: $PR_STATE" + if [[ "$PR_STATE" != *"MERGED"* ]]; then + die "ITK PR #$ITK_PR_NUMBER does not appear merged. Phase B requires +the Phase A PR to be merged on ITK main first. Aborting." + fi +fi + +# -------------------------------------------------------------------- +# Work area +# -------------------------------------------------------------------- +WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/archive-v4-$MODULE.XXXXXX") +cleanup() { + if $KEEP_TEMPDIR; then + info "Tempdir preserved at: $WORKDIR" + else + rm -rf "$WORKDIR" + fi +} +trap cleanup EXIT + +# -------------------------------------------------------------------- +# Step 1: shallow clone of upstream (fresh — never the Phase A clone) +# -------------------------------------------------------------------- +CLONE="$WORKDIR/$MODULE" +info "Cloning upstream into $CLONE (depth 50, fresh — not Phase A's clone)..." +git clone --quiet --depth 50 "$UPSTREAM_URL" "$CLONE" +DEFAULT_BRANCH=$(git -C "$CLONE" symbolic-ref --short HEAD 2>/dev/null || echo main) +info " Default branch: $DEFAULT_BRANCH" + +# -------------------------------------------------------------------- +# Step 2: build the removal commit. Walk the whitelist line-by-line, +# `git rm -rf` each path that exists. Commit with a clear subject. +# -------------------------------------------------------------------- +info "Removing whitelisted (now-migrated) files..." +REMOVED_COUNT=0 +( + cd "$CLONE" + while IFS= read -r line; do + line="${line%%#*}" # strip comment + line="${line//[$'\t\r ']/}" # strip whitespace (paths shouldn't contain spaces) + [[ -z "$line" ]] && continue + # Globs are evaluated relative to the repo root. + matches=( $(eval "ls -d $line 2>/dev/null" || true) ) + if (( ${#matches[@]} > 0 )); then + git rm -rf "${matches[@]}" >/dev/null 2>&1 || true + REMOVED_COUNT=$(( REMOVED_COUNT + ${#matches[@]} )) + fi + done < "$WHITELIST" +) + +# Commit only if anything was removed +if (( REMOVED_COUNT == 0 )); then + warn "No whitelisted paths matched in upstream; nothing to remove." + warn "Skipping removal commit — proceeding to README promotion." +else + COMMIT_MSG_RM="MIGRATE: Remove files migrated to ITK main" + if [[ -n "$ITK_PR_NUMBER" ]]; then + COMMIT_MSG_RM="$COMMIT_MSG_RM (ITK PR #$ITK_PR_NUMBER)" + fi + ( + cd "$CLONE" + git -c user.name="ITK Migration Bot" \ + -c user.email="itk-migration@itk.org" \ + commit -m "$COMMIT_MSG_RM" \ + -m "These files have been ingested into the ITK main repository. +This repository is being archived; see README for details." + ) || die "Removal commit failed" + info " Removed $REMOVED_COUNT path(s); commit: $COMMIT_MSG_RM" +fi + +# -------------------------------------------------------------------- +# Step 3: promote MIGRATION_README.md to README.md so the rendered +# landing page on GitHub shows the migration notice +# (per feedback_ingest_archive_readme.md). +# +# Cases handled: +# (a) MIGRATION_README.md exists, README.rst (or other non-.md) exists +# → mv README.rst -> info.rst ; mv MIGRATION_README.md -> README.md +# (b) MIGRATION_README.md exists, README.md exists (already .md) +# → mv MIGRATION_README.md README.md (overwrites the old README; +# keeping the original around as info.md preserves its history) +# (c) MIGRATION_README.md does not exist +# → warn — operator must add it manually before re-running. +# -------------------------------------------------------------------- +PROMOTED=false +( + cd "$CLONE" + if [[ ! -f MIGRATION_README.md ]]; then + warn "No MIGRATION_README.md in upstream root." + warn "Add one (e.g., copying the LabelErodeDilate template) and re-run." + exit 0 + fi + if [[ -f README.rst ]]; then + git mv README.rst info.rst + elif [[ -f README.md ]]; then + git mv README.md info.md + fi + git mv MIGRATION_README.md README.md + + COMMIT_MSG_DOC="DOC: Promote migration notice to README.md" + git -c user.name="ITK Migration Bot" \ + -c user.email="itk-migration@itk.org" \ + commit -m "$COMMIT_MSG_DOC" \ + -m "GitHub renders README.md (or README.rst) on the landing page, +not MIGRATION_README.md. Promoting the migration notice to README.md +ensures visitors to the archived repo see the redirect." +) +if [[ -f "$CLONE/README.md" ]] && grep -qi "migrat" "$CLONE/README.md" 2>/dev/null; then + PROMOTED=true +fi + +# -------------------------------------------------------------------- +# Step 4: dry-run stops here +# -------------------------------------------------------------------- +if $DRY_RUN; then + info "" + info "Dry-run complete. Commits prepared in $CLONE." + info "Inspect with:" + info " cd $CLONE && git log --oneline | head -5" + exit 0 +fi + +# -------------------------------------------------------------------- +# Step 5: push to upstream main (the irreversible step) +# -------------------------------------------------------------------- +info "Pushing to upstream $DEFAULT_BRANCH..." +( + cd "$CLONE" + git push origin "HEAD:$DEFAULT_BRANCH" +) || die "Push to upstream failed. Check authentication and write access." + +# -------------------------------------------------------------------- +# Step 6: verify GitHub renders the migration notice +# -------------------------------------------------------------------- +info "Verifying GitHub-rendered README is the migration notice..." +RENDERED=$(gh api "repos/$OWNER_REPO/readme" --jq '.content' 2>/dev/null \ + | base64 -d 2>/dev/null | head -20 || true) +if [[ -z "$RENDERED" ]]; then + warn "Could not fetch rendered README via gh api." + warn "Manual check needed: https://github.com/$OWNER_REPO" +elif ! echo "$RENDERED" | grep -qi "migrat\|archive\|moved to ITK"; then + warn "Rendered README does not appear to be the migration notice." + warn "First lines:" + printf '%s\n' "$RENDERED" | sed 's/^/ /' >&2 + die "README precedence verification failed (per feedback_ingest_archive_readme.md)." +else + info " OK: rendered README contains migration language." +fi + +# -------------------------------------------------------------------- +# Step 7: flip archived=true (skip if --no-archive-flip) +# -------------------------------------------------------------------- +if $NO_ARCHIVE_FLIP; then + info "Skipping archived=true flip per --no-archive-flip." + info "When ready, run:" + info " gh api -X PATCH repos/$OWNER_REPO -F archived=true" +else + info "Flipping archived=true on $OWNER_REPO..." + gh api -X PATCH "repos/$OWNER_REPO" -F archived=true >/dev/null \ + || die "Failed to flip archived=true. Check permissions on $OWNER_REPO." + info " OK: $OWNER_REPO is now archived." +fi + +info "" +info "Phase B complete." +info " Upstream removal commit pushed to $UPSTREAM_URL" +info " README precedence: rendered README is the migration notice" +$NO_ARCHIVE_FLIP || info " Repo state: archived" diff --git a/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh b/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh new file mode 100755 index 00000000000..baf3489134b --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh @@ -0,0 +1,372 @@ +#!/usr/bin/env bash +# ingest-module-v4.sh — Phase A driver for v4 remote-module ingestion +# +# Phase A is the LOCAL, REVERSIBLE half of the ingestion process. It +# rewrites the upstream remote-module's history into a form suitable +# for a Mode-A merge into ITK proper. Phase A makes NO commits or +# pushes to the upstream remote — the local clone is throwaway. The +# subsequent upstream-archival publish step lives in Phase B +# (archive-remote-module.sh) and is gated on the Phase A PR being +# merged into ITK main. +# +# See INGESTION_STRATEGY_v4.md for the full design. +# +# Usage: +# ingest-module-v4.sh [options] +# +# Options: +# --upstream-url URL Override the GIT_REPOSITORY parsed from +# Modules/Remote/.remote.cmake. +# --whitelist FILE Override the default whitelist location +# (whitelists/.list, this directory). +# --dry-run Run filter-repo + sanitize passes but do +# NOT merge into ITK. Reports what would +# land. +# --keep-tempdir Don't delete the rewritten clone after +# finishing (useful with --dry-run for +# post-mortem inspection). +# --skip-format Skip sanitize-history.py. ONLY for debugging +# the filter-repo passes; the resulting branch +# will not pass ITK's pre-commit gate. +# -h, --help Show this help. +# +# Exit codes: +# 0 — success +# 1 — argument or environment failure +# 2 — filter-repo, sanitize, or merge failure +# 3 — post-merge verification failed (pre-commit, prefix scan) + +set -euo pipefail + +# -------------------------------------------------------------------- +# Helpers +# -------------------------------------------------------------------- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +info() { printf '==> %s\n' "$*"; } +warn() { printf 'WARN: %s\n' "$*" >&2; } +die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; } + +show_help() { + sed -n '2,/^$/{ s/^# \?//; p }' "$0" + exit 0 +} + +# -------------------------------------------------------------------- +# Argument parsing +# -------------------------------------------------------------------- +MODULE="" +DEST_GROUP="" +UPSTREAM_URL="" +WHITELIST="" +DRY_RUN=false +KEEP_TEMPDIR=false +SKIP_FORMAT=false + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) show_help ;; + --upstream-url) UPSTREAM_URL="$2"; shift 2 ;; + --whitelist) WHITELIST="$2"; shift 2 ;; + --dry-run) DRY_RUN=true; shift ;; + --keep-tempdir) KEEP_TEMPDIR=true; shift ;; + --skip-format) SKIP_FORMAT=true; shift ;; + -*) die "Unknown option: $1" ;; + *) + if [[ -z "$MODULE" ]]; then MODULE="$1" + elif [[ -z "$DEST_GROUP" ]]; then DEST_GROUP="$1" + else die "Unexpected positional argument: $1" + fi + shift + ;; + esac +done + +[[ -n "$MODULE" ]] || die "Module name required (e.g., IOMeshSTL)" +[[ -n "$DEST_GROUP" ]] || die "Destination group required (e.g., IO, Filtering, Segmentation)" + +# -------------------------------------------------------------------- +# Preflight +# -------------------------------------------------------------------- +command -v git-filter-repo >/dev/null \ + || die "git-filter-repo required. Install with: pixi global install git-filter-repo" + +if ! $SKIP_FORMAT; then + command -v clang-format >/dev/null \ + || die "clang-format required for sanitize step (or pass --skip-format)" + command -v black >/dev/null \ + || die "black required for sanitize step (or pass --skip-format)" + command -v gersemi >/dev/null \ + || die "gersemi required for sanitize step (or pass --skip-format)" +fi + +ITK_SRC="$(git rev-parse --show-toplevel 2>/dev/null || true)" +[[ -n "$ITK_SRC" ]] || die "Must be run from inside a git checkout of ITK" +[[ -f "$ITK_SRC/itk-module.cmake" || -f "$ITK_SRC/CMakeLists.txt" ]] \ + || die "Current git root does not look like ITK: $ITK_SRC" + +if ! $DRY_RUN && [[ -n "$(git -C "$ITK_SRC" status --porcelain)" ]]; then + die "Working tree not clean; commit or stash first (or use --dry-run)" +fi + +# Resolve whitelist +if [[ -z "$WHITELIST" ]]; then + WHITELIST="$SCRIPT_DIR/whitelists/$MODULE.list" +fi +[[ -r "$WHITELIST" ]] || die "Whitelist not readable: $WHITELIST + Create it from a sibling module's whitelist as a starting point." + +# Resolve clang-format / gersemi config files from the destination ITK tree +CLANG_FORMAT_STYLE="$ITK_SRC/.clang-format" +[[ -f "$CLANG_FORMAT_STYLE" ]] \ + || die "ITK .clang-format missing at $CLANG_FORMAT_STYLE" +GERSEMI_CONFIG="$ITK_SRC/.gersemi.config" +[[ -f "$GERSEMI_CONFIG" ]] \ + || warn "ITK .gersemi.config missing at $GERSEMI_CONFIG; gersemi will use defaults" + +# Infer upstream URL from the .remote.cmake if not provided +if [[ -z "$UPSTREAM_URL" ]]; then + REMOTE_FILE="$ITK_SRC/Modules/Remote/$MODULE.remote.cmake" + [[ -f "$REMOTE_FILE" ]] \ + || die "$REMOTE_FILE not found; pass --upstream-url explicitly" + UPSTREAM_URL=$(awk '/GIT_REPOSITORY/ { print $2 }' "$REMOTE_FILE" | head -1) + [[ -n "$UPSTREAM_URL" ]] || die "Could not parse GIT_REPOSITORY from $REMOTE_FILE" +fi + +# Destination collision check +DEST_DIR="$ITK_SRC/Modules/$DEST_GROUP/$MODULE" +if ! $DRY_RUN && [[ -e "$DEST_DIR" ]]; then + die "Destination $DEST_DIR already exists; module already ingested?" +fi + +info "Phase A — Ingestion (LOCAL, throwaway)" +info " Module: $MODULE" +info " DestGroup: $DEST_GROUP" +info " Upstream URL: $UPSTREAM_URL" +info " Whitelist: $WHITELIST" +info " ITK source: $ITK_SRC" +info " Style file: $CLANG_FORMAT_STYLE" +[[ -f "$GERSEMI_CONFIG" ]] && info " Gersemi cfg: $GERSEMI_CONFIG" +$DRY_RUN && info " Mode: --dry-run" +$KEEP_TEMPDIR && info " Tempdir: will be preserved" +$SKIP_FORMAT && warn " --skip-format: result will NOT pass pre-commit" + +# -------------------------------------------------------------------- +# Work area +# -------------------------------------------------------------------- +WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/ingest-v4-$MODULE.XXXXXX") +LOG_DIR="$WORKDIR/logs" +mkdir -p "$LOG_DIR" + +cleanup() { + if $KEEP_TEMPDIR; then + info "Tempdir preserved at: $WORKDIR" + else + rm -rf "$WORKDIR" + fi +} +trap cleanup EXIT + +# -------------------------------------------------------------------- +# Step 1: mirror-clone upstream into a NON-bare clone (filter-repo +# operates on either, but we want a working tree available for +# sanitize-history.py's helper paths). +# -------------------------------------------------------------------- +CLONE="$WORKDIR/$MODULE" +info "Cloning upstream into $CLONE ..." +git clone --quiet --no-local "$UPSTREAM_URL" "$CLONE" + +UPSTREAM_SHA=$(git -C "$CLONE" rev-parse HEAD) +UPSTREAM_DEFAULT_BRANCH=$(git -C "$CLONE" symbolic-ref --short HEAD 2>/dev/null || echo main) +UPSTREAM_COMMIT_COUNT=$(git -C "$CLONE" rev-list --count HEAD) +UPSTREAM_MERGE_COUNT=$(git -C "$CLONE" rev-list --count --merges HEAD) +info " Upstream tip: $UPSTREAM_SHA" +info " Upstream default branch: $UPSTREAM_DEFAULT_BRANCH" +info " Total commits: $UPSTREAM_COMMIT_COUNT" +info " Merge commits: $UPSTREAM_MERGE_COUNT" + +# -------------------------------------------------------------------- +# Step 2: filter-repo whitelist pass — read paths from the whitelist +# file. --paths-from-file accepts shell globs, one per line, # comments. +# -------------------------------------------------------------------- +info "Running filter-repo whitelist pass (--paths-from-file)..." +( + cd "$CLONE" + git filter-repo --force \ + --paths-from-file "$WHITELIST" \ + --prune-empty always +) || die "filter-repo whitelist pass failed" + +# -------------------------------------------------------------------- +# Step 3: subdirectory move into Modules/// +# -------------------------------------------------------------------- +info "Moving into Modules/$DEST_GROUP/$MODULE/ ..." +( + cd "$CLONE" + git filter-repo --force \ + --to-subdirectory-filter "Modules/$DEST_GROUP/$MODULE" \ + --prune-empty always +) || die "filter-repo subdirectory pass failed" + +POST_FILTER_COMMITS=$(git -C "$CLONE" rev-list --count HEAD) +POST_FILTER_MERGES=$(git -C "$CLONE" rev-list --count --merges HEAD) +info " After filter-repo: $POST_FILTER_COMMITS commits, $POST_FILTER_MERGES merges" + +# -------------------------------------------------------------------- +# Step 4: sanitize-history.py — clang-format every C++ blob, black +# every .py blob, prefix every commit subject with a valid ITK prefix. +# -------------------------------------------------------------------- +if $SKIP_FORMAT; then + warn "Skipping sanitize-history.py (per --skip-format)" +else + info "Running sanitize-history.py (clang-format + black + gersemi + ws/eof + prefix)..." + GERSEMI_ARG=() + [[ -f "$GERSEMI_CONFIG" ]] && GERSEMI_ARG=(--gersemi-config "$GERSEMI_CONFIG") + python3 "$SCRIPT_DIR/sanitize-history.py" \ + --repo "$CLONE" \ + --clang-format-style "$CLANG_FORMAT_STYLE" \ + "${GERSEMI_ARG[@]}" \ + --log-dir "$LOG_DIR" \ + || die "sanitize-history.py failed" + info " Sanitize logs in: $LOG_DIR/" +fi + +# -------------------------------------------------------------------- +# Step 5: topology verification. If upstream had merges, the rewritten +# branch MUST still have merges (filter-repo preserves merge topology). +# This guards against regressions where a future change to filter-repo +# args silently linearizes — see feedback_ingest_merge_topology.md. +# -------------------------------------------------------------------- +POST_SANITIZE_COMMITS=$(git -C "$CLONE" rev-list --count HEAD) +POST_SANITIZE_MERGES=$(git -C "$CLONE" rev-list --count --merges HEAD) +info "" +info "Final rewritten history:" +info " Commits: $UPSTREAM_COMMIT_COUNT -> $POST_SANITIZE_COMMITS" +info " Merges: $UPSTREAM_MERGE_COUNT -> $POST_SANITIZE_MERGES" + +if (( UPSTREAM_MERGE_COUNT > 0 )) && (( POST_SANITIZE_MERGES == 0 )); then + die "Topology violation: upstream had $UPSTREAM_MERGE_COUNT merge(s) but rewritten history has 0. + This indicates linearization, which is forbidden per feedback_ingest_merge_topology.md." +fi + +# -------------------------------------------------------------------- +# Step 6: prefix-validation scan. Every commit subject on the rewritten +# branch must start with a valid ITK prefix after sanitize-history.py +# ran. Catch any escapes. +# -------------------------------------------------------------------- +if ! $SKIP_FORMAT; then + info "Verifying every commit subject has a valid ITK prefix..." + BAD_SUBJECTS=$( + git -C "$CLONE" log --format='%H %s' \ + | awk '$2 !~ /^(BUG|COMP|DOC|ENH|PERF|STYLE|WIP):/ { print }' + ) + if [[ -n "$BAD_SUBJECTS" ]]; then + warn "" + warn "=================================================================" + warn " PREFIX VIOLATION: commits without a valid prefix found. " + warn " sanitize-history.py should have caught these — investigate. " + warn "" + printf '%s\n' "$BAD_SUBJECTS" >&2 | head -10 + warn "=================================================================" + die "Refusing to merge into ITK with non-conforming subjects." + fi + info " OK: every commit has a valid prefix." +fi + +# -------------------------------------------------------------------- +# Step 7: dry-run stops here +# -------------------------------------------------------------------- +if $DRY_RUN; then + info "" + info "Dry-run complete. No changes made to ITK working tree." + info "Rewritten clone is at: $CLONE" + info "Sanitize logs at: $LOG_DIR/" + info "" + info "To inspect:" + info " cd $CLONE" + info " git log --oneline --graph | head -50" + info " cat $LOG_DIR/commit-prefix-log.txt | head -30" + exit 0 +fi + +# -------------------------------------------------------------------- +# Step 8: collect authors for Co-authored-by trailers in merge message +# -------------------------------------------------------------------- +info "Collecting author list for Co-authored-by trailers..." +PRIMARY_AUTHOR=$( + git -C "$CLONE" log --format='%an <%ae>' HEAD \ + | sort | uniq -c | sort -rn | head -1 | sed 's/^ *[0-9]* *//' +) +readarray -t ALL_AUTHORS < <(git -C "$CLONE" log --format='%an <%ae>' HEAD | sort -u) +CO_AUTHOR_LINES="" +for a in "${ALL_AUTHORS[@]}"; do + [[ "$a" != "$PRIMARY_AUTHOR" ]] && CO_AUTHOR_LINES+="Co-authored-by: $a"$'\n' +done + +# -------------------------------------------------------------------- +# Step 9: Mode-A merge into the current ITK branch +# -------------------------------------------------------------------- +MERGE_MSG=$(cat < $POST_SANITIZE_MERGES merge(s). + +Primary author: $PRIMARY_AUTHOR + +$CO_AUTHOR_LINES +EOF +) + +info "Merging filter-repo+sanitize output into $(git -C "$ITK_SRC" rev-parse --short HEAD)..." +( + cd "$ITK_SRC" + git remote add ingest-v4-tmp "$CLONE" 2>/dev/null || true + git fetch --quiet ingest-v4-tmp \ + "$UPSTREAM_DEFAULT_BRANCH:ingest-v4-tmp-default" + git merge --allow-unrelated-histories --no-ff \ + -m "$MERGE_MSG" \ + ingest-v4-tmp-default + git remote remove ingest-v4-tmp + git branch -D ingest-v4-tmp-default +) || die "Merge failed" + +# -------------------------------------------------------------------- +# Step 10: post-merge verification +# -------------------------------------------------------------------- +info "" +info "Post-merge verification..." +if ! pre-commit run --all-files >/dev/null 2>&1; then + warn "pre-commit reported issues — re-run interactively to inspect:" + warn " pre-commit run --all-files" + exit 3 +fi +info " OK: pre-commit run --all-files exits 0." + +info "" +info "Phase A complete. Required follow-up commits (in this PR):" +info " 1. DOC: Add Modules/$DEST_GROUP/$MODULE/README.md pointing at archived upstream" +info " 2. COMP: Remove Modules/Remote/$MODULE.remote.cmake" +info " 3. ENH: Add -DModule_$MODULE:BOOL=ON to pyproject.toml configure-ci" +info " 4. Verify local build + tests pass:" +info " pixi run -e cxx configure-ci && pixi run -e cxx build" +info " ctest --test-dir build -R $MODULE" +info "" +info "After this PR has merged into ITK main, run Phase B:" +info " $SCRIPT_DIR/archive-remote-module.sh $MODULE" +info " (Phase B publishes the upstream removal commit + flips archived=true.)" diff --git a/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py new file mode 100755 index 00000000000..3157b055429 --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py @@ -0,0 +1,539 @@ +#!/usr/bin/env python3 +"""sanitize-history.py — Apply ITK formatting and commit-prefix conventions +to every commit on a filter-repo'd ingestion branch. + +Per the v4 strategy (Phase A, step 5), this script walks the rewritten +ingestion branch and applies the SAME formatters that ITK's +`.pre-commit-config.yaml` enforces, so the rewritten history is gated +through identical rules. Concretely, for every blob: + + Content-aware (filename-blind, sniff-driven): + 1. C/C++ blobs -> `clang-format -style=file` (ITK's .clang-format) + 2. Python blobs -> `black` + 3. CMake blobs -> `gersemi --config .gersemi.config` + 4. All text blobs -> trailing-whitespace strip + + end-of-file fixer (final \\n) + + mixed-line-ending (CRLF -> LF) + 5. Binary / .sha* / .vtk / .vtp / .svg content -> skip (left as-is) + + Commit-message: + Examine the subject; if it does not start with a recognized ITK + prefix (BUG/COMP/DOC/ENH/PERF/STYLE), auto-prepend one based on + a conservative heuristic. Every decision is logged to a sidecar + file for operator audit. + +git-filter-repo's streaming model emits blobs before commits and does +not expose a public "fetch blob by id" API in commit_callback context, +so all blob-content rewriting happens in blob_callback. This is +filename-blind by necessity — ITK's pre-commit excludes (which are +filename-pattern based) are approximated via content-sniffing. For +typical ITK module ingests, the upstream whitelist (via filter-repo's +--paths-from-file step) has already pruned ThirdParty / Data / +build / pixi-cache paths, so the exclude set narrows to a few +content-detectable extensions (.sha / .sha512 / .svg / .vtk / .vtp). + +Usage (invoked from ingest-module-v4.sh; not normally run directly): + + python3 sanitize-history.py \\ + --repo /path/to/filter-repo'd/clone \\ + --clang-format-style /path/to/itk/.clang-format \\ + --gersemi-config /path/to/itk/.gersemi.config \\ + --log-dir /path/to/log/dir +""" + +from __future__ import annotations + +import argparse +import os +import re +import shutil +import subprocess +import sys +from pathlib import Path + +try: + import git_filter_repo as fr +except ImportError: + print( + "Error: git_filter_repo Python module not found.\n" + " Install with: pip install git-filter-repo\n" + " Or: pixi global install git-filter-repo", + file=sys.stderr, + ) + sys.exit(1) + + +# --------------------------------------------------------------------------- +# Commit-subject prefix logic +# --------------------------------------------------------------------------- + +PREFIX_RE = re.compile(rb"^\s*(BUG|COMP|DOC|ENH|PERF|STYLE|WIP)\s*:") + + +def heuristic_prefix(subject: bytes) -> bytes: + s = subject.lower() + if ( + b"fix " in s + or s.startswith(b"fix") + or b" bug" in s + or b"bug " in s + or b"crash" in s + or b"segfault" in s + or b"leak" in s + or b"regression" in s + or b"corrupt" in s + or b"deadlock" in s + ): + return b"BUG" + if ( + b"cmake" in s + or b"compil" in s + or b"warning" in s + or b"build" in s + or b" ci " in s + or s.startswith(b"ci:") + or b" gcc" in s + or b"clang " in s + or b" msvc" in s + ): + return b"COMP" + if ( + b"doc " in s + or b"docs" in s + or s.startswith(b"doc") + or b"readme" in s + or b"comment" in s + or b"license" in s + or b"sphinx" in s + or b"doxygen" in s + ): + return b"DOC" + if ( + b"style" in s + or b"whitespace" in s + or b"format" in s + or b"rename" in s + or b" lint" in s + or s.startswith(b"lint") + or b"clang-format" in s + or b"cleanup" in s + or b"clean up" in s + or b"reorder" in s + ): + return b"STYLE" + if ( + b"perf" in s + or b"optim" in s + or b"speed" in s + or b"faster" in s + or b"efficien" in s + ): + return b"PERF" + return b"ENH" + + +# --------------------------------------------------------------------------- +# Content-type sniffing — filename-blind, conservative +# --------------------------------------------------------------------------- + +CXX_HINTS = ( + b"#include", + b"#pragma", + b"#ifndef ITK", + b"#ifndef itk", + b"#ifndef __itk", + b"namespace itk", + b"namespace ITK", + b"template <", + b"template<", +) + +PY_SHEBANG_HINTS = ( + b"#!/usr/bin/env python", + b"#!/usr/bin/python", + b"#!python", +) +PY_KEYWORD_HINTS = ( + b"\nimport ", + b"\nfrom ", + b"\ndef ", + b"\nclass ", +) + +CMAKE_HINTS = ( + b"cmake_minimum_required", + b"\nproject(", + b"\nadd_library(", + b"\nadd_executable(", + b"\nadd_subdirectory(", + b"\nfind_package(", + b"\ntarget_link_libraries(", + b"\ntarget_include_directories(", + b"\nset(CMAKE_", + b"\ninclude(CMakeParseArguments", + b"\nitk_module(", + b"\nitk_module_impl", + b"\nitk_module_test", + b"\nitk_add_test", + b"\nset_tests_properties", + b"\nitk_wrap_", +) + +# Single-line content-link / hash file signatures +HEX_HASH_RE = re.compile(rb"^[0-9a-f]{32,128}\s*$") + +# Skip-entirely sniffs (content cannot be safely text-fixed) +SKIP_HINTS = ( + b" bool: + """True if the leading bytes contain a NUL — fast text/binary heuristic.""" + return b"\x00" in head[:8192] + + +def is_skip_content(data: bytes, head: bytes) -> bool: + """Return True for content that should be left untouched (CID/sha + content-links, VTK volumes, SVG, etc.).""" + if any(s in head[:512] for s in SKIP_HINTS): + return True + # Single-token hex hash file (CID content-link sidecar) + if data.count(b"\n") <= 1 and HEX_HASH_RE.match(data.strip()): + return True + return False + + +def sniff_kind(data: bytes) -> str | None: + """Return 'cxx', 'py', 'cmake', 'text', or None. + + 'text' = generic plain text — will get only universal whitespace/EOF + fixers, no kind-specific formatter. + None = binary or skip-content (do nothing).""" + if not data: + return None + + head = data[:8192] + if is_binary(head): + return None + if is_skip_content(data, head): + return None + + head_lstripped = head.lstrip() + + if any(head_lstripped.startswith(s) for s in PY_SHEBANG_HINTS): + return "py" + + cxx_hit = any(hint in head for hint in CXX_HINTS) + cmake_hit = any(hint in b"\n" + head for hint in CMAKE_HINTS) + py_hit = any(hint in b"\n" + head for hint in PY_KEYWORD_HINTS) + + # Disambiguate when multiple hits: cxx > cmake > py > generic text + # CXX wins because its hints are highly specific; #include is unambiguous. + # CMake wins over python because cmake_minimum_required + add_library are + # specific, while `import` / `from` could be a comment in CMake. + if cxx_hit: + return "cxx" + if cmake_hit: + return "cmake" + if py_hit: + return "py" + return "text" + + +# --------------------------------------------------------------------------- +# Universal text fixers (correspond to pre-commit-hooks +# trailing-whitespace, end-of-file-fixer, mixed-line-ending) +# --------------------------------------------------------------------------- + + +def fix_mixed_line_ending(data: bytes) -> bytes: + # ITK convention: LF only. Convert CRLF and bare CR -> LF. + return data.replace(b"\r\n", b"\n").replace(b"\r", b"\n") + + +def fix_trailing_whitespace(data: bytes) -> bytes: + # Strip horizontal whitespace from end of every line. + lines = data.split(b"\n") + return b"\n".join(line.rstrip(b" \t") for line in lines) + + +def fix_end_of_file(data: bytes) -> bytes: + if not data: + return data + return data.rstrip(b"\n") + b"\n" + + +def apply_universal_text_fixers(data: bytes) -> bytes: + return fix_end_of_file(fix_trailing_whitespace(fix_mixed_line_ending(data))) + + +# --------------------------------------------------------------------------- +# Formatter wrappers — fail-soft (return original on error) +# --------------------------------------------------------------------------- + + +def fmt_cxx(data: bytes, clang_format_bin: str) -> bytes: + try: + proc = subprocess.run( + [ + clang_format_bin, + "-style=file", + "-assume-filename=ingested.cxx", + ], + input=data, + capture_output=True, + check=True, + timeout=60, + ) + return proc.stdout or data + except ( + subprocess.CalledProcessError, + subprocess.TimeoutExpired, + FileNotFoundError, + ): + return data + + +def fmt_py(data: bytes, black_bin: str) -> bytes: + try: + proc = subprocess.run( + [black_bin, "-q", "-"], + input=data, + capture_output=True, + check=True, + timeout=60, + ) + return proc.stdout or data + except ( + subprocess.CalledProcessError, + subprocess.TimeoutExpired, + FileNotFoundError, + ): + return data + + +def fmt_cmake(data: bytes, gersemi_bin: str, gersemi_config: str | None) -> bytes: + cmd = [gersemi_bin] + if gersemi_config: + cmd += ["--config", gersemi_config] + cmd += ["-"] # stdin -> stdout + try: + proc = subprocess.run( + cmd, + input=data, + capture_output=True, + check=True, + timeout=60, + ) + return proc.stdout or data + except ( + subprocess.CalledProcessError, + subprocess.TimeoutExpired, + FileNotFoundError, + ): + return data + + +# --------------------------------------------------------------------------- +# Stateful sanitizer with logging +# --------------------------------------------------------------------------- + + +class HistorySanitizer: + def __init__( + self, + *, + clang_format_bin: str, + black_bin: str, + gersemi_bin: str, + gersemi_config: str | None, + log_dir: Path, + ): + self.clang_format_bin = clang_format_bin + self.black_bin = black_bin + self.gersemi_bin = gersemi_bin + self.gersemi_config = gersemi_config + self.log_dir = log_dir + self.log_dir.mkdir(parents=True, exist_ok=True) + self.prefix_log = (log_dir / "commit-prefix-log.txt").open("w", buffering=1) + self.format_log = (log_dir / "format-actions.log").open("w", buffering=1) + self.commit_count = 0 + self.prefix_changes = 0 + self.blob_count = 0 + self.format_changes = 0 + self.kind_counts: dict[str, int] = {} + + # -- callbacks ----------------------------------------------------------- + + def blob_callback(self, blob: fr.Blob, _metadata: dict) -> None: + self.blob_count += 1 + kind = sniff_kind(blob.data) + if kind is None: + return + self.kind_counts[kind] = self.kind_counts.get(kind, 0) + 1 + + original_data = blob.data + original_len = len(original_data) + + # Step 1: kind-specific formatter + if kind == "cxx": + new_data = fmt_cxx(original_data, self.clang_format_bin) + elif kind == "py": + new_data = fmt_py(original_data, self.black_bin) + elif kind == "cmake": + new_data = fmt_cmake(original_data, self.gersemi_bin, self.gersemi_config) + else: # 'text' + new_data = original_data + + # Step 2: universal text fixers — applied to every text blob, + # even ones the kind-specific formatter already passed over. + new_data = apply_universal_text_fixers(new_data) + + if new_data == original_data: + return + + self.format_changes += 1 + try: + self.format_log.write( + f"{kind:5s} {original_len:8d} -> {len(new_data):8d} " + f"{(blob.original_id or b'').decode('utf-8', 'replace')[:12]}\n" + ) + except Exception: + pass + blob.data = new_data + + def commit_callback(self, commit: fr.Commit, _metadata: dict) -> None: + self.commit_count += 1 + msg = commit.message + nl = msg.find(b"\n") + subject = msg if nl == -1 else msg[:nl] + rest = b"" if nl == -1 else msg[nl:] + if PREFIX_RE.match(subject): + return + prefix = heuristic_prefix(subject) + commit.message = prefix + b": " + subject + rest + self.prefix_changes += 1 + try: + old = subject.decode("utf-8", "replace").strip()[:120] + sha = (commit.original_id or b"").decode("utf-8", "replace") + self.prefix_log.write(f"{sha[:12]} {prefix.decode()}: {old}\n") + except Exception: + pass + + def close(self) -> None: + for f in (self.prefix_log, self.format_log): + try: + f.close() + except Exception: + pass + + +def main() -> int: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("--repo", type=Path, required=True) + p.add_argument("--clang-format-style", type=Path, required=True) + p.add_argument( + "--gersemi-config", + type=Path, + default=None, + help="Path to ITK's .gersemi.config (optional but strongly recommended)", + ) + p.add_argument("--log-dir", type=Path, required=True) + p.add_argument( + "--clang-format-bin", default=os.environ.get("CLANG_FORMAT_BIN", "clang-format") + ) + p.add_argument("--black-bin", default=os.environ.get("BLACK_BIN", "black")) + p.add_argument("--gersemi-bin", default=os.environ.get("GERSEMI_BIN", "gersemi")) + args = p.parse_args() + + if not args.repo.exists(): + print(f"Error: repo path does not exist: {args.repo}", file=sys.stderr) + return 2 + if not args.clang_format_style.exists(): + print( + f"Error: clang-format file does not exist: {args.clang_format_style}", + file=sys.stderr, + ) + return 2 + for binname, binval in ( + ("clang-format", args.clang_format_bin), + ("black", args.black_bin), + ("gersemi", args.gersemi_bin), + ): + if shutil.which(binval) is None: + print(f"Error: {binname} not found at {binval}", file=sys.stderr) + return 2 + + target_style = args.repo / ".clang-format" + if not target_style.exists(): + target_style.write_bytes(args.clang_format_style.read_bytes()) + + gersemi_config_path: str | None = None + if args.gersemi_config: + if not args.gersemi_config.exists(): + print( + f"Error: gersemi config does not exist: {args.gersemi_config}", + file=sys.stderr, + ) + return 2 + gersemi_config_path = str(args.gersemi_config) + else: + print( + "Warning: --gersemi-config not supplied; gersemi will run with defaults.", + file=sys.stderr, + ) + + print(f"sanitize-history: rewriting commits in {args.repo}", file=sys.stderr) + print(f" clang-format: {args.clang_format_bin}", file=sys.stderr) + print(f" black: {args.black_bin}", file=sys.stderr) + print(f" gersemi: {args.gersemi_bin}", file=sys.stderr) + print(f" style file: {args.clang_format_style}", file=sys.stderr) + print(f" gersemi config: {gersemi_config_path or '(default)'}", file=sys.stderr) + print(f" log dir: {args.log_dir}", file=sys.stderr) + + os.chdir(args.repo) + + sanitizer = HistorySanitizer( + clang_format_bin=args.clang_format_bin, + black_bin=args.black_bin, + gersemi_bin=args.gersemi_bin, + gersemi_config=gersemi_config_path, + log_dir=args.log_dir, + ) + + filter_args = fr.FilteringOptions.parse_args(["--force"]) + repo_filter = fr.RepoFilter( + filter_args, + blob_callback=sanitizer.blob_callback, + commit_callback=sanitizer.commit_callback, + ) + + try: + repo_filter.run() + finally: + sanitizer.close() + + kc = sanitizer.kind_counts + print( + f"\nsanitize-history complete:\n" + f" commits walked: {sanitizer.commit_count}\n" + f" prefix auto-prepended: {sanitizer.prefix_changes}\n" + f" blobs scanned: {sanitizer.blob_count}\n" + f" blobs reformatted: {sanitizer.format_changes}\n" + f" by-kind sniff: " + f"cxx={kc.get('cxx', 0)} " + f"py={kc.get('py', 0)} " + f"cmake={kc.get('cmake', 0)} " + f"text={kc.get('text', 0)} " + f"skip={sanitizer.blob_count - sum(kc.values())}\n" + f" logs: {args.log_dir}/", + file=sys.stderr, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/Utilities/Maintenance/RemoteModuleIngest/whitelists/IOMeshSTL.list b/Utilities/Maintenance/RemoteModuleIngest/whitelists/IOMeshSTL.list new file mode 100644 index 00000000000..24019b6adfc --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/whitelists/IOMeshSTL.list @@ -0,0 +1,25 @@ +# IOMeshSTL — files that migrate into ITK at Modules/IO/MeshSTL/ +# +# Format: one path per line, relative to the upstream repo root. +# Globs supported via filter-repo --paths-from-file (treated as +# path prefixes; trailing /** matches everything beneath the dir). +# Lines starting with '#' are comments; blank lines ignored. + +# Public headers +include + +# Implementation sources +src + +# Tests + fixtures +test + +# Python wrapping +wrapping + +# CMake / module metadata +CMakeLists.txt +itk-module.cmake + +# License (always migrates with the code) +LICENSE From 85a1065fbddbca933bee3c22991e211a3948fcba Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Mon, 4 May 2026 13:38:31 -0500 Subject: [PATCH 02/15] COMP: Add scaffolding deny-pass to v4 ingest driver The whitelist admits whole directories (test/, wrapping/, ...) but some upstream remote modules place CI / packaging scaffolding inside those (e.g. test/Docker/, .github/, azure-pipelines/). Without a deny-pass after the whitelist filter, those leak into ITK history. Mirrors v3's --invert-paths --path-glob pass; discovered missing on the first Cuberille v4 ingest (test/Docker/{Dockerfile,build,run,test}.sh leaked). --- .../RemoteModuleIngest/ingest-module-v4.sh | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh b/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh index baf3489134b..0b2b9663f72 100755 --- a/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh +++ b/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh @@ -196,6 +196,53 @@ info "Running filter-repo whitelist pass (--paths-from-file)..." --prune-empty always ) || die "filter-repo whitelist pass failed" +# -------------------------------------------------------------------- +# Step 2b: scaffolding deny-pass. The whitelist admits whole +# directories (test/, wrapping/, ...) but upstream remote modules +# sometimes place CI / packaging scaffolding inside those (e.g. +# test/Docker/, wrapping/azure-pipelines.yml, .github/ inside test/). +# Strip those out across all history with a per-commit invert-glob +# pass. Without this, scaffolding leaks into ITK's history; bug +# discovered on the first Cuberille v4 ingest (test/Docker/* leaked). +# -------------------------------------------------------------------- +info "Running scaffolding deny-pattern strip pass..." +( + cd "$CLONE" + git filter-repo --force \ + --invert-paths \ + --path-glob '**/CTestConfig.cmake' \ + --path-glob '**/azure-pipelines*.yml' \ + --path-glob '**/azure-pipelines/*' \ + --path-glob '**/Dockerfile' \ + --path-glob '**/Dockerfile.*' \ + --path-glob '**/Dockerfile-*' \ + --path-glob '**/.dockerignore' \ + --path-glob '**/[Dd]ocker/*' \ + --path-glob '**/.[Dd]ocker/*' \ + --path-glob '**/Jenkinsfile' \ + --path-glob '**/.circleci/*' \ + --path-glob '**/circle.yml' \ + --path-glob '**/.travis.yml' \ + --path-glob '**/appveyor.yml' \ + --path-glob '**/.appveyor.yml' \ + --path-glob '**/.cirun.yml' \ + --path-glob '**/.gitlab-ci.yml' \ + --path-glob '**/.github/*' \ + --path-glob '**/codecov.yml' \ + --path-glob '**/.codecov.yml' \ + --path-glob '**/tox.ini' \ + --path-glob '**/pyproject.toml' \ + --path-glob '**/setup.py' \ + --path-glob '**/setup.cfg' \ + --path-glob '**/MANIFEST.in' \ + --path-glob '**/requirements*.txt' \ + --path-glob '**/environment*.yml' \ + --path-glob '**/.clang-format' \ + --path-glob '**/.clang-tidy' \ + --path-glob '**/.pre-commit-config.yaml' \ + --prune-empty always +) || die "filter-repo deny-pass failed" + # -------------------------------------------------------------------- # Step 3: subdirectory move into Modules/// # -------------------------------------------------------------------- From 68c2fd8f37d1fc3acd6bf899ff37e37305fffc70 Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Mon, 4 May 2026 14:05:02 -0500 Subject: [PATCH 03/15] COMP: Tighten v4 ingest to satisfy ITK ghostflow checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the four classes of ghostflow errors that surfaced on the first Cuberille v4 ingest (PR #6205) — the only allowable remaining error should be the unavoidable upstream root commit. - Strip *.orig / *.rej / *.BACKUP.* / *.LOCAL.* / *.REMOTE.* / *.BASE.* merge-conflict-artifact files from history (catches leftover-conflict-marker errors like the .h.orig on Cuberille). - sanitize-history.py: cap every commit subject at 78 chars (ghostflow / kw-commit-msg rule); excess words move into the body. - sanitize-history.py: insert a blank line between subject and body when the original commit message lacked one (two-line subject error). - sanitize-history.py: clear the executable bit (mode 100755 -> 100644) on text-file extensions so root-commit "executable permissions but file does not look executable" stops firing. --- .../RemoteModuleIngest/ingest-module-v4.sh | 6 + .../RemoteModuleIngest/sanitize-history.py | 151 ++++++++++++++++-- 2 files changed, 146 insertions(+), 11 deletions(-) diff --git a/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh b/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh index 0b2b9663f72..07fbe77fe1f 100755 --- a/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh +++ b/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh @@ -240,6 +240,12 @@ info "Running scaffolding deny-pattern strip pass..." --path-glob '**/.clang-format' \ --path-glob '**/.clang-tidy' \ --path-glob '**/.pre-commit-config.yaml' \ + --path-glob '**/*.orig' \ + --path-glob '**/*.rej' \ + --path-glob '**/*.BACKUP.*' \ + --path-glob '**/*.LOCAL.*' \ + --path-glob '**/*.REMOTE.*' \ + --path-glob '**/*.BASE.*' \ --prune-empty always ) || die "filter-repo deny-pass failed" diff --git a/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py index 3157b055429..93340e21033 100755 --- a/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py +++ b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py @@ -68,6 +68,42 @@ # --------------------------------------------------------------------------- PREFIX_RE = re.compile(rb"^\s*(BUG|COMP|DOC|ENH|PERF|STYLE|WIP)\s*:") +SUBJECT_MAX_LEN = 78 # ITK ghostflow / kw-commit-msg rule +SUBJECT_WORDBREAK_MIN = 40 # only break at a space past this offset + + +def truncate_subject_to_body(subject: bytes, rest: bytes) -> tuple[bytes, bytes, bool]: + """If the subject exceeds SUBJECT_MAX_LEN, truncate it (preferring a word + break) and prepend the excess to the body. Returns (new_subject, new_rest, + changed).""" + if len(subject) <= SUBJECT_MAX_LEN: + return subject, rest, False + cut = SUBJECT_MAX_LEN + space_pos = subject.rfind(b" ", 0, SUBJECT_MAX_LEN) + if space_pos > SUBJECT_WORDBREAK_MIN: + cut = space_pos + head = subject[:cut].rstrip() + tail = subject[cut:].lstrip() + if not tail: + return head, rest, True + if rest: + # rest starts with at least one '\n' from the original separator + new_rest = b"\n\n" + tail + rest + else: + new_rest = b"\n\n" + tail + b"\n" + return head, new_rest, True + + +def ensure_blank_line_after_subject(rest: bytes) -> tuple[bytes, bool]: + """Subject must be followed by a blank line if a body exists. rest, when + non-empty, starts with '\\n' (the original line separator). If the next + char isn't also '\\n', insert one to make it a proper blank line.""" + if not rest: + return rest, False + if rest.startswith(b"\n\n"): + return rest, False + # rest starts with single '\n' followed by body content; convert to "\n\n" + return b"\n" + rest, True def heuristic_prefix(subject: bytes) -> bytes: @@ -315,6 +351,58 @@ def fmt_py(data: bytes, black_bin: str) -> bytes: return data +TEXT_FILE_EXTS = ( + b".c", + b".cc", + b".cpp", + b".cxx", + b".h", + b".hpp", + b".hxx", + b".txx", + b".tcc", + b".py", + b".cmake", + b".wrap", + b".md", + b".rst", + b".txt", + b".yml", + b".yaml", + b".json", + b".toml", + b".ini", + b".cfg", + b".in", + b".tex", + b".bib", + b".css", + b".html", + b".xml", + b".dox", +) +TEXT_FILE_BASENAMES = ( + b"cmakelists.txt", + b"license", + b"readme", + b"copying", + b"authors", + b"changelog", + b"itk-module.cmake", +) + + +def filename_is_text(filename: bytes) -> bool: + """Return True for filenames that are clearly source/text and should + therefore never carry the executable bit (used to fix mode-bit drift in + upstream module commits).""" + lower = filename.lower() + if any(lower.endswith(e) for e in TEXT_FILE_EXTS): + return True + base = lower.rsplit(b"/", 1)[-1] + return base in TEXT_FILE_BASENAMES + + def fmt_cmake(data: bytes, gersemi_bin: str, gersemi_config: str | None) -> bytes: cmd = [gersemi_bin] if gersemi_config: @@ -362,6 +450,9 @@ def __init__( self.format_log = (log_dir / "format-actions.log").open("w", buffering=1) self.commit_count = 0 self.prefix_changes = 0 + self.subject_truncations = 0 + self.subject_blank_inserts = 0 + self.mode_normalizations = 0 self.blob_count = 0 self.format_changes = 0 self.kind_counts: dict[str, int] = {} @@ -407,21 +498,56 @@ def blob_callback(self, blob: fr.Blob, _metadata: dict) -> None: def commit_callback(self, commit: fr.Commit, _metadata: dict) -> None: self.commit_count += 1 + + # ---- subject sanitization ---- msg = commit.message nl = msg.find(b"\n") subject = msg if nl == -1 else msg[:nl] rest = b"" if nl == -1 else msg[nl:] - if PREFIX_RE.match(subject): - return - prefix = heuristic_prefix(subject) - commit.message = prefix + b": " + subject + rest - self.prefix_changes += 1 - try: - old = subject.decode("utf-8", "replace").strip()[:120] - sha = (commit.original_id or b"").decode("utf-8", "replace") - self.prefix_log.write(f"{sha[:12]} {prefix.decode()}: {old}\n") - except Exception: - pass + + # Step 1: ensure ITK prefix + prepended = False + if not PREFIX_RE.match(subject): + prefix = heuristic_prefix(subject) + subject = prefix + b": " + subject + self.prefix_changes += 1 + prepended = True + try: + old = ( + (msg if nl == -1 else msg[:nl]) + .decode("utf-8", "replace") + .strip()[:120] + ) + sha = (commit.original_id or b"").decode("utf-8", "replace") + self.prefix_log.write(f"{sha[:12]} {prefix.decode()}: {old}\n") + except Exception: + pass + + # Step 2: enforce 78-char subject limit (ghostflow / kw-commit-msg) + subject, rest, truncated = truncate_subject_to_body(subject, rest) + if truncated: + self.subject_truncations += 1 + + # Step 3: ensure subject is followed by a blank line if a body exists + rest, blank_inserted = ensure_blank_line_after_subject(rest) + if blank_inserted: + self.subject_blank_inserts += 1 + + if prepended or truncated or blank_inserted: + commit.message = subject + rest + + # ---- file-mode normalization (clear executable bit on text files) ---- + if commit.file_changes: + for change in commit.file_changes: + if change.type not in (b"M", b"A"): + continue + if not change.filename or not change.mode: + continue + if change.mode != b"100755": + continue + if filename_is_text(change.filename): + change.mode = b"100644" + self.mode_normalizations += 1 def close(self) -> None: for f in (self.prefix_log, self.format_log): @@ -521,6 +647,9 @@ def main() -> int: f"\nsanitize-history complete:\n" f" commits walked: {sanitizer.commit_count}\n" f" prefix auto-prepended: {sanitizer.prefix_changes}\n" + f" subjects truncated: {sanitizer.subject_truncations}\n" + f" blank lines inserted: {sanitizer.subject_blank_inserts}\n" + f" exec-bits cleared: {sanitizer.mode_normalizations}\n" f" blobs scanned: {sanitizer.blob_count}\n" f" blobs reformatted: {sanitizer.format_changes}\n" f" by-kind sniff: " From 393cb7b5a9d8a53a00274668d07344348d7e6424 Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Mon, 4 May 2026 14:37:44 -0500 Subject: [PATCH 04/15] COMP: v4 sanitize patches itk-module README.rst->README.md references Many ITK remote modules' itk-module.cmake reads file(READ "${MY_CURRENT_DIR}/README.rst" DOCUMENTATION) to populate the CMake DESCRIPTION variable from the upstream README. The v4 whitelist intentionally excludes the upstream README (the operator ships a new README.md as part of the ingest PR), so without this rewrite every commit's CMakeLists fails to configure post-ingest until the operator manually patches itk-module.cmake. sanitize-history.py now rewrites the file(READ ... README.rst ...) reference to README.md as part of the cmake-blob transform, so every historical commit on the rewritten branch is independently buildable. Surfaced on the IOMeshSTL ingest (PR following PR #6204). --- .../RemoteModuleIngest/sanitize-history.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py index 93340e21033..7fd0d9da22c 100755 --- a/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py +++ b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py @@ -403,6 +403,25 @@ def filename_is_text(filename: bytes) -> bool: return base in TEXT_FILE_BASENAMES +# Many ITK remote-module itk-module.cmake files do +# file(READ "${MY_CURRENT_DIR}/README.rst" DOCUMENTATION) +# to populate the CMake `DESCRIPTION` variable. The v4 whitelist +# excludes the upstream README (the operator ships a new README.md as +# part of the ingest PR), so leaving that line as-is breaks every +# historical commit's build. Rewriting the reference to README.md +# makes every commit independently buildable in the ingested tree. +README_RST_REF_RE = re.compile( + rb"(file\s*\(\s*READ\s+[^)]*?)README\.rst", re.IGNORECASE +) + + +def patch_readme_reference(data: bytes) -> tuple[bytes, bool]: + """Rewrite `file(READ ".../README.rst" ...)` -> README.md. Returns + (new_data, changed).""" + new_data, n = README_RST_REF_RE.subn(rb"\1README.md", data) + return new_data, n > 0 + + def fmt_cmake(data: bytes, gersemi_bin: str, gersemi_config: str | None) -> bytes: cmd = [gersemi_bin] if gersemi_config: @@ -453,6 +472,7 @@ def __init__( self.subject_truncations = 0 self.subject_blank_inserts = 0 self.mode_normalizations = 0 + self.readme_ref_patches = 0 self.blob_count = 0 self.format_changes = 0 self.kind_counts: dict[str, int] = {} @@ -476,6 +496,9 @@ def blob_callback(self, blob: fr.Blob, _metadata: dict) -> None: new_data = fmt_py(original_data, self.black_bin) elif kind == "cmake": new_data = fmt_cmake(original_data, self.gersemi_bin, self.gersemi_config) + new_data, readme_patched = patch_readme_reference(new_data) + if readme_patched: + self.readme_ref_patches += 1 else: # 'text' new_data = original_data @@ -650,6 +673,7 @@ def main() -> int: f" subjects truncated: {sanitizer.subject_truncations}\n" f" blank lines inserted: {sanitizer.subject_blank_inserts}\n" f" exec-bits cleared: {sanitizer.mode_normalizations}\n" + f" README.rst->.md fixes: {sanitizer.readme_ref_patches}\n" f" blobs scanned: {sanitizer.blob_count}\n" f" blobs reformatted: {sanitizer.format_changes}\n" f" by-kind sniff: " From 11b1928042fc55f610d4ad9557d4f36bfa1842c1 Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Tue, 5 May 2026 05:51:58 -0500 Subject: [PATCH 05/15] =?UTF-8?q?DOC:=20v4=20strategy=20=E2=80=94=20record?= =?UTF-8?q?=20ghostflow=20root-commit=20+=20Greptile=20review=20patterns?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 'Known artifacts at PR-review time' section to capture two things future ingest operators need to expect but that v3/v4 do not fix automatically: - The single ghostflow-check-main 'root commit not allowed' error that every Mode-A merge produces. Maintainers override at merge. Calling it out keeps operators from chasing a non-fix. - A short table of code-level patterns Greptile has flagged post-ingest on multiple modules (IOMeshSTL #6206, RLEImage #6208): signed-vs-unsigned size types, missing override, dead return after itkExceptionMacro, stray test args, external-friend declarations, GetBuffer-const correctness, and allocator-init invariants. The table is the durable channel for cross-ingest review knowledge — extend it as new recurring patterns surface. --- .../INGESTION_STRATEGY_v4.md | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md index bf53f8eeb0a..61f2e9ba14c 100644 --- a/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md +++ b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md @@ -240,6 +240,53 @@ sidecar log; the cost of `ENH: STYLE-cleanup-of-headers` slipping past review is much lower than the cost of `STYLE: Add new IO format support` mis-prefixed. +## Known artifacts at PR-review time + +### Unavoidable: ghostflow root-commit failure + +The Mode-A `--allow-unrelated-histories` merge brings the upstream +module's whole lineage into the PR diff, including its first commit +(zero parents). ITK's `ghostflow-check-main` rejects every PR that +introduces a root commit: + +``` +commit not allowed; it is a root commit. +``` + +This single error is **expected and accepted** for every v4 ingest — +maintainers override at merge time. Removing the root would force +linearization (squash or rebase-without-merges) and forfeit the +merge-topology requirement (per `feedback_ingest_merge_topology.md`), +which the project considers a worse trade. + +If `ghostflow-check-main` reports any error other than the single +root-commit line, that error **is** a real problem for the operator to +fix before maintainer review. Keep an eye on the message body. + +### Code-level patterns commonly flagged by Greptile post-ingest + +Phase A's per-commit gates fix style and prefix issues but not +semantics. These patterns recur across upstream remote modules and +each requires a human-judgment fix once the ingest is in PR form. +The list is intentionally short — only patterns observed on multiple +v3/v4 ingests are listed. Treat it as the operator's pre-ready-for- +review checklist, not as something to auto-fix. + +| Pattern | Where seen | Resolution | +|---|---|---| +| Signed `int32_t` / `int` used for spec-defined unsigned counts (e.g. file-format triangle counts, voxel counts) | IOMeshSTL #6206 | Switch to `uint32_t` / `SizeValueType`; retype `ByteSwapper<>`; verify decrement loops | +| Missing `override` specifier on virtual base-class method overrides | (universal) | Add `override`; if base method is non-virtual, document why the derived method shadows | +| Dead `return;` after `itkExceptionMacro(...)` | IOMeshSTL #6206 | `itkExceptionMacro` throws; the trailing `return;` is unreachable — delete | +| `inputFilename=` in error path that opens the **output** stream (and vice-versa) | IOMeshSTL #6206 | Cosmetic but misleading; correct the label | +| Test pipeline `Update()` not wrapped in `ITK_TRY_EXPECT_NO_EXCEPTION` | IOMeshSTL #6206 | Wrap so failure surfaces with a descriptive message instead of an unhandled-exception crash | +| `GetBuffer() const` returning a non-`const` smart pointer (allows mutation through a `const` receiver) | RLEImage #6208 | Return `BufferType::ConstPointer`; consider whether `m_Buffer`'s `mutable` is necessary | +| Global-namespace forward declaration + `friend class ::Foo;` inside an ITK header, where `Foo` is not part of ITK | RLEImage #6208 | Strip both the forward decl and the `friend` grant; they are remote-module-internal coupling | +| Stray positional argument after the function name in `itk_add_test(... COMMAND ...Driver fnName )` | RLEImage #6208 | Test driver passes `argv[2..n]` to `fnName` as arguments — it does not invoke a second function. Remove the stray, register the second test separately if needed | +| Allocator overrides whose `bool initialize` parameter is silently ignored | RLEImage #6208 | Verify the override has an internal invariant requiring unconditional init **before** "fixing" — for some module types (e.g. RLEImage's RLE-line buffer) the unconditional fill IS the contract; document it instead | + +When an upcoming ingest hits a new pattern that recurs across modules, +extend this table; it is the durable channel for v4 review-knowledge. + ## What this strategy explicitly does NOT change from v3 - The Mode-A (`--no-ff --allow-unrelated-histories`) merge into ITK From 67a965a01e19d32e5ee88b0875653fb21edf6d7a Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Tue, 5 May 2026 06:11:42 -0500 Subject: [PATCH 06/15] =?UTF-8?q?COMP:=20v4=20deny-pass=20=E2=80=94=20stri?= =?UTF-8?q?p=20ExternalData=20fetch=20cache=20from=20history?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ExternalData stores resolved fetched content as .ExternalData__ files alongside the .cid / .md5 sidecars they correspond to. These are local fetch cache, not source — ExternalData regenerates them on demand at consumer-time. Upstream remote modules sometimes commit them inadvertently from a CTest run inside their working tree (e.g. ITKIOMeshSTL had four such files in test/Baseline/, flagged by @dzenanz on PR #6206). Add **/.ExternalData_* to the scaffolding deny-pass so they never enter ITK history. --- Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh b/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh index 07fbe77fe1f..056c33a007e 100755 --- a/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh +++ b/Utilities/Maintenance/RemoteModuleIngest/ingest-module-v4.sh @@ -246,6 +246,7 @@ info "Running scaffolding deny-pattern strip pass..." --path-glob '**/*.LOCAL.*' \ --path-glob '**/*.REMOTE.*' \ --path-glob '**/*.BASE.*' \ + --path-glob '**/.ExternalData_*' \ --prune-empty always ) || die "filter-repo deny-pass failed" From 7f16e2f9df41e66c8b7d566dc85fd894d2f878a6 Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Tue, 5 May 2026 06:11:46 -0500 Subject: [PATCH 07/15] =?UTF-8?q?COMP:=20v4=20sanitize=20=E2=80=94=20unwra?= =?UTF-8?q?p=20standalone-build=20CMake=20guard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream remote modules wrap their top-level CMakeLists.txt in if(NOT ITK_SOURCE_DIR) find_package(ITK REQUIRED) list(APPEND CMAKE_MODULE_PATH ${ITK_CMAKE_DIR}) include(ITKModuleExternal) else() itk_module_impl() endif() so the same file works both as a fetched remote module and as a stand-alone CMake project. In an in-tree ingested module ITK_SOURCE_DIR is always defined, so the if-branch is dead code; @dzenanz flagged it on PR #6206 (IOMeshSTL). Add patch_standalone_build_guard() that detects the idiom (allowing 2- or 4-space indent variants) and replaces it with a bare itk_module_impl() before gersemi formatting runs. Track count via sanitizer.standalone_guard_patches and print it in the run summary. --- .../RemoteModuleIngest/sanitize-history.py | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py index 7fd0d9da22c..a23717e0354 100755 --- a/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py +++ b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py @@ -422,6 +422,45 @@ def patch_readme_reference(data: bytes) -> tuple[bytes, bool]: return new_data, n > 0 +# Upstream remote modules wrap their top-level CMakeLists.txt in a +# standalone-build guard so the same file works both as a +# remote-fetched module and as a stand-alone CMake project: +# +# if(NOT ITK_SOURCE_DIR) +# find_package(ITK REQUIRED) +# list(APPEND CMAKE_MODULE_PATH ${ITK_CMAKE_DIR}) +# include(ITKModuleExternal) +# else() +# itk_module_impl() +# endif() +# +# In an in-tree ingested module, ITK_SOURCE_DIR is always defined, so +# the if-branch is dead code. Reviewers flag it on every ingest (e.g. +# @dzenanz on PR #6206 IOMeshSTL). Strip the wrapper here so it is +# never present in the ingest history. +STANDALONE_BUILD_GUARD_RE = re.compile( + rb"if\s*\(\s*NOT\s+ITK_SOURCE_DIR\s*\)[ \t]*\n" + rb"[ \t]*find_package\s*\(\s*ITK\s+REQUIRED\s*\)[ \t]*\n" + rb"[ \t]*list\s*\(\s*APPEND\s+CMAKE_MODULE_PATH\s+" + rb"\$\{\s*ITK_CMAKE_DIR\s*\}\s*\)[ \t]*\n" + rb"[ \t]*include\s*\(\s*ITKModuleExternal\s*\)[ \t]*\n" + rb"[ \t]*else\s*\(\s*\)[ \t]*\n" + rb"([ \t]*)itk_module_impl\s*\(\s*\)[ \t]*\n" + rb"[ \t]*endif\s*\(\s*\)", + re.IGNORECASE, +) + + +def patch_standalone_build_guard(data: bytes) -> tuple[bytes, bool]: + """Replace the `if(NOT ITK_SOURCE_DIR) ... else() itk_module_impl() endif()` + standalone-build wrapper with a bare `itk_module_impl()`. Returns + (new_data, changed). The replacement preserves the indentation of the + inner `itk_module_impl()` line so post-rewrite gersemi has a stable + starting point.""" + new_data, n = STANDALONE_BUILD_GUARD_RE.subn(rb"\1itk_module_impl()", data) + return new_data, n > 0 + + def fmt_cmake(data: bytes, gersemi_bin: str, gersemi_config: str | None) -> bytes: cmd = [gersemi_bin] if gersemi_config: @@ -473,6 +512,7 @@ def __init__( self.subject_blank_inserts = 0 self.mode_normalizations = 0 self.readme_ref_patches = 0 + self.standalone_guard_patches = 0 self.blob_count = 0 self.format_changes = 0 self.kind_counts: dict[str, int] = {} @@ -495,7 +535,10 @@ def blob_callback(self, blob: fr.Blob, _metadata: dict) -> None: elif kind == "py": new_data = fmt_py(original_data, self.black_bin) elif kind == "cmake": - new_data = fmt_cmake(original_data, self.gersemi_bin, self.gersemi_config) + new_data, guard_unwrapped = patch_standalone_build_guard(original_data) + if guard_unwrapped: + self.standalone_guard_patches += 1 + new_data = fmt_cmake(new_data, self.gersemi_bin, self.gersemi_config) new_data, readme_patched = patch_readme_reference(new_data) if readme_patched: self.readme_ref_patches += 1 @@ -674,6 +717,7 @@ def main() -> int: f" blank lines inserted: {sanitizer.subject_blank_inserts}\n" f" exec-bits cleared: {sanitizer.mode_normalizations}\n" f" README.rst->.md fixes: {sanitizer.readme_ref_patches}\n" + f" standalone-guard rm: {sanitizer.standalone_guard_patches}\n" f" blobs scanned: {sanitizer.blob_count}\n" f" blobs reformatted: {sanitizer.format_changes}\n" f" by-kind sniff: " From 9ff730896190206b3fbc78cd816ebba43de857b5 Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Tue, 5 May 2026 06:11:50 -0500 Subject: [PATCH 08/15] =?UTF-8?q?DOC:=20v4=20strategy=20=E2=80=94=20split?= =?UTF-8?q?=20auto-sanitized=20vs.=20operator-fix=20patterns?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'Greptile post-ingest patterns' table mixed mechanical artifacts (strippable in Phase A) with semantic concerns (require human judgment). Split into two sections so the operator's checklist shrinks as the sanitizer learns more rules. Promote .ExternalData_* and the standalone-build CMake guard to the auto-sanitized table now that the previous two commits handle them. --- .../INGESTION_STRATEGY_v4.md | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md index 61f2e9ba14c..67fb546139a 100644 --- a/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md +++ b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md @@ -263,14 +263,29 @@ If `ghostflow-check-main` reports any error other than the single root-commit line, that error **is** a real problem for the operator to fix before maintainer review. Keep an eye on the message body. +### Patterns Phase A now sanitizes automatically + +The following recurring upstream artifacts are stripped or rewritten +by the v4 sanitizer; operators do not need to fix them by hand: + +| Artifact | Where handled | Why | +|---|---|---| +| `**/.ExternalData_*` (e.g. `.ExternalData_MD5_` baseline cache) | `ingest-module-v4.sh` deny-pass | Local fetch-cache from upstream's CTest run; ExternalData regenerates from `.cid` / `.md5` sidecars on demand. (@dzenanz, PR #6206) | +| `if(NOT ITK_SOURCE_DIR) ... find_package(ITK) ... else() itk_module_impl() endif()` standalone-build guard in module `CMakeLists.txt` | `sanitize-history.py:patch_standalone_build_guard` | In-tree, `ITK_SOURCE_DIR` is always defined; the if-branch is dead code. (@dzenanz, PR #6206) | +| `README.rst` references in CMake `file(READ ...)` calls | `sanitize-history.py:patch_readme_reference` | Phase B archival promotes `MIGRATION_README.md` to `README.md`; in-tree consumers read the markdown form | +| `*.orig`, `*.rej`, `*.BACKUP.*`, `*.LOCAL.*`, `*.REMOTE.*`, `*.BASE.*` | deny-pass | Leftover merge-conflict artifacts | +| Scaffolding (`Dockerfile*`, `azure-pipelines*.yml`, `.github/`, `.travis.yml`, `.circleci/`, `tox.ini`, `pyproject.toml`, `setup.py`, `.clang-format`, `.pre-commit-config.yaml`, …) | deny-pass | Module's per-repo CI/packaging is irrelevant in-tree | + +Each sanitizer prints a ` patches` line in the run summary so +the operator can confirm the rule fired (or didn't) on a given module. + ### Code-level patterns commonly flagged by Greptile post-ingest -Phase A's per-commit gates fix style and prefix issues but not -semantics. These patterns recur across upstream remote modules and -each requires a human-judgment fix once the ingest is in PR form. -The list is intentionally short — only patterns observed on multiple +The patterns below still require **human judgment** to fix — they're +semantic, not mechanical, and the v4 sanitizer leaves them alone. +The list is intentionally short: only patterns observed on multiple v3/v4 ingests are listed. Treat it as the operator's pre-ready-for- -review checklist, not as something to auto-fix. +review checklist. | Pattern | Where seen | Resolution | |---|---|---| From cea536fb96aa214b4e855f571891067d8ac020f5 Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Tue, 5 May 2026 06:49:43 -0500 Subject: [PATCH 09/15] ENH: Add v4 PolarTransform whitelist (header-only, mirrors Cuberille) --- .../whitelists/PolarTransform.list | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 Utilities/Maintenance/RemoteModuleIngest/whitelists/PolarTransform.list diff --git a/Utilities/Maintenance/RemoteModuleIngest/whitelists/PolarTransform.list b/Utilities/Maintenance/RemoteModuleIngest/whitelists/PolarTransform.list new file mode 100644 index 00000000000..60e6f4ed84f --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/whitelists/PolarTransform.list @@ -0,0 +1,21 @@ +# PolarTransform — files that migrate into ITK at Modules/Filtering/PolarTransform/ +# +# Format: one path per line, relative to the upstream repo root. +# Globs supported via filter-repo --paths-from-file. +# Lines starting with '#' are comments; blank lines ignored. + +# Public headers (PolarTransform is header-only; no src/) +include + +# Tests + fixtures +test + +# Python wrapping +wrapping + +# CMake / module metadata +CMakeLists.txt +itk-module.cmake + +# License (always migrates with the code) +LICENSE From bd785b0314f97030e96d8214be83b7dcdd958d02 Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Tue, 5 May 2026 07:08:25 -0500 Subject: [PATCH 10/15] ENH: Add v4 SplitComponents whitelist (header-only, mirrors Cuberille) --- .../whitelists/SplitComponents.list | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 Utilities/Maintenance/RemoteModuleIngest/whitelists/SplitComponents.list diff --git a/Utilities/Maintenance/RemoteModuleIngest/whitelists/SplitComponents.list b/Utilities/Maintenance/RemoteModuleIngest/whitelists/SplitComponents.list new file mode 100644 index 00000000000..4929b68cc47 --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/whitelists/SplitComponents.list @@ -0,0 +1,21 @@ +# SplitComponents — files that migrate into ITK at Modules/Filtering/SplitComponents/ +# +# Format: one path per line, relative to the upstream repo root. +# Globs supported via filter-repo --paths-from-file. +# Lines starting with '#' are comments; blank lines ignored. + +# Public headers (SplitComponents is header-only; no src/) +include + +# Tests + fixtures +test + +# Python wrapping +wrapping + +# CMake / module metadata +CMakeLists.txt +itk-module.cmake + +# License (always migrates with the code) +LICENSE From b91669eca15d966a81d39e2695e742eb6fa950f9 Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Tue, 5 May 2026 08:45:36 -0500 Subject: [PATCH 11/15] =?UTF-8?q?ENH:=20Add=20v4=20IOMeshMZ3=20whitelist?= =?UTF-8?q?=20(mirrors=20IOMeshSTL=20=E2=80=94=20has=20include=20+=20src)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../whitelists/IOMeshMZ3.list | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 Utilities/Maintenance/RemoteModuleIngest/whitelists/IOMeshMZ3.list diff --git a/Utilities/Maintenance/RemoteModuleIngest/whitelists/IOMeshMZ3.list b/Utilities/Maintenance/RemoteModuleIngest/whitelists/IOMeshMZ3.list new file mode 100644 index 00000000000..b82b11dbb19 --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/whitelists/IOMeshMZ3.list @@ -0,0 +1,24 @@ +# IOMeshMZ3 — files that migrate into ITK at Modules/IO/IOMeshMZ3/ +# +# Format: one path per line, relative to the upstream repo root. +# Globs supported via filter-repo --paths-from-file. +# Lines starting with '#' are comments; blank lines ignored. + +# Public headers +include + +# Implementation sources +src + +# Tests + fixtures +test + +# Python wrapping +wrapping + +# CMake / module metadata +CMakeLists.txt +itk-module.cmake + +# License (always migrates with the code) +LICENSE From e9e5c5acc7b52bb1fc616c5894c39e7f48efb70c Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Tue, 5 May 2026 08:59:48 -0500 Subject: [PATCH 12/15] =?UTF-8?q?ENH:=20Add=20v4=20IOFDF=20whitelist=20(mi?= =?UTF-8?q?rrors=20IOMeshSTL=20=E2=80=94=20has=20include=20+=20src)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../RemoteModuleIngest/whitelists/IOFDF.list | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 Utilities/Maintenance/RemoteModuleIngest/whitelists/IOFDF.list diff --git a/Utilities/Maintenance/RemoteModuleIngest/whitelists/IOFDF.list b/Utilities/Maintenance/RemoteModuleIngest/whitelists/IOFDF.list new file mode 100644 index 00000000000..7a032df7a29 --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/whitelists/IOFDF.list @@ -0,0 +1,24 @@ +# IOFDF — files that migrate into ITK at Modules/IO/IOFDF/ +# +# Format: one path per line, relative to the upstream repo root. +# Globs supported via filter-repo --paths-from-file. +# Lines starting with '#' are comments; blank lines ignored. + +# Public headers +include + +# Implementation sources +src + +# Tests + fixtures +test + +# Python wrapping +wrapping + +# CMake / module metadata +CMakeLists.txt +itk-module.cmake + +# License (always migrates with the code) +LICENSE From 424f51049cd93ca00e61c1abfa11c940cd130530 Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Tue, 5 May 2026 11:08:02 -0500 Subject: [PATCH 13/15] =?UTF-8?q?COMP:=20v4=20sanitize=20=E2=80=94=20strip?= =?UTF-8?q?=20cmake=5Fminimum=5Frequired=20from=20module=20CMakeLists?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream remote modules typically begin their top-level CMakeLists.txt with their own cmake_minimum_required(VERSION X.Y.Z) declaration, often 3.10.2 or earlier. ITK's top-level CMakeLists pins a higher minimum, so the per-module line is redundant and frequently *lower* than ITK's floor. @dzenanz flagged it on every ingest with the comment 'Version behind, not needed.' — most recently on PR #6215 (IOFDF). Add patch_drop_cmake_minimum_required() that detects the idiom and removes the matching line via a multiline regex. Track count via sanitizer.cmake_min_required_drops and print it in the run summary. Apply alongside patch_standalone_build_guard() in the cmake-blob branch of blob_callback. --- .../RemoteModuleIngest/sanitize-history.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py index a23717e0354..70d100104d5 100755 --- a/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py +++ b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py @@ -461,6 +461,30 @@ def patch_standalone_build_guard(data: bytes) -> tuple[bytes, bool]: return new_data, n > 0 +# Upstream module CMakeLists.txt frequently begin with their own +# cmake_minimum_required(VERSION X.Y.Z) declaration (often +# 3.10.2 or earlier). In-tree, ITK's top-level CMakeLists pins a +# higher minimum, so the per-module line is at best redundant and at +# worst conflicts with policy. Reviewers (e.g. @dzenanz on PR #6215 +# IOFDF) flag it on every ingest with the comment "Version behind, +# not needed." Strip the standalone declaration here so it is never +# present in the ingest history. +CMAKE_MIN_REQUIRED_RE = re.compile( + rb"^[ \t]*cmake_minimum_required\s*\([^)]*\)[ \t]*\n", + re.IGNORECASE | re.MULTILINE, +) + + +def patch_drop_cmake_minimum_required(data: bytes) -> tuple[bytes, bool]: + """Remove any ``cmake_minimum_required(...)`` line from a CMake blob. + Returns (new_data, changed). Safe to apply to every CMake blob: + only top-level module CMakeLists files normally carry this idiom, + and ITK does not want the per-module declaration anywhere in the + ingested tree.""" + new_data, n = CMAKE_MIN_REQUIRED_RE.subn(b"", data) + return new_data, n > 0 + + def fmt_cmake(data: bytes, gersemi_bin: str, gersemi_config: str | None) -> bytes: cmd = [gersemi_bin] if gersemi_config: @@ -513,6 +537,7 @@ def __init__( self.mode_normalizations = 0 self.readme_ref_patches = 0 self.standalone_guard_patches = 0 + self.cmake_min_required_drops = 0 self.blob_count = 0 self.format_changes = 0 self.kind_counts: dict[str, int] = {} @@ -538,6 +563,9 @@ def blob_callback(self, blob: fr.Blob, _metadata: dict) -> None: new_data, guard_unwrapped = patch_standalone_build_guard(original_data) if guard_unwrapped: self.standalone_guard_patches += 1 + new_data, cmake_min_dropped = patch_drop_cmake_minimum_required(new_data) + if cmake_min_dropped: + self.cmake_min_required_drops += 1 new_data = fmt_cmake(new_data, self.gersemi_bin, self.gersemi_config) new_data, readme_patched = patch_readme_reference(new_data) if readme_patched: @@ -718,6 +746,7 @@ def main() -> int: f" exec-bits cleared: {sanitizer.mode_normalizations}\n" f" README.rst->.md fixes: {sanitizer.readme_ref_patches}\n" f" standalone-guard rm: {sanitizer.standalone_guard_patches}\n" + f" cmake_min_required rm: {sanitizer.cmake_min_required_drops}\n" f" blobs scanned: {sanitizer.blob_count}\n" f" blobs reformatted: {sanitizer.format_changes}\n" f" by-kind sniff: " From 91976ca414905d404fec355a8457318cc20c924a Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Tue, 5 May 2026 11:08:07 -0500 Subject: [PATCH 14/15] =?UTF-8?q?DOC:=20v4=20strategy=20=E2=80=94=20add=20?= =?UTF-8?q?cmake=5Fminimum=5Frequired-stripper=20to=20auto-sanitize=20tabl?= =?UTF-8?q?e?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md | 1 + 1 file changed, 1 insertion(+) diff --git a/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md index 67fb546139a..3b7d6e543a3 100644 --- a/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md +++ b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md @@ -272,6 +272,7 @@ by the v4 sanitizer; operators do not need to fix them by hand: |---|---|---| | `**/.ExternalData_*` (e.g. `.ExternalData_MD5_` baseline cache) | `ingest-module-v4.sh` deny-pass | Local fetch-cache from upstream's CTest run; ExternalData regenerates from `.cid` / `.md5` sidecars on demand. (@dzenanz, PR #6206) | | `if(NOT ITK_SOURCE_DIR) ... find_package(ITK) ... else() itk_module_impl() endif()` standalone-build guard in module `CMakeLists.txt` | `sanitize-history.py:patch_standalone_build_guard` | In-tree, `ITK_SOURCE_DIR` is always defined; the if-branch is dead code. (@dzenanz, PR #6206) | +| `cmake_minimum_required(VERSION X.Y.Z)` line at the top of an ingested module's `CMakeLists.txt` | `sanitize-history.py:patch_drop_cmake_minimum_required` | ITK's top-level CMakeLists pins a higher minimum; per-module declarations are redundant and frequently **lower** than the ITK floor (3.10.2 is common upstream). (@dzenanz, PR #6215 IOFDF) | | `README.rst` references in CMake `file(READ ...)` calls | `sanitize-history.py:patch_readme_reference` | Phase B archival promotes `MIGRATION_README.md` to `README.md`; in-tree consumers read the markdown form | | `*.orig`, `*.rej`, `*.BACKUP.*`, `*.LOCAL.*`, `*.REMOTE.*`, `*.BASE.*` | deny-pass | Leftover merge-conflict artifacts | | Scaffolding (`Dockerfile*`, `azure-pipelines*.yml`, `.github/`, `.travis.yml`, `.circleci/`, `tox.ini`, `pyproject.toml`, `setup.py`, `.clang-format`, `.pre-commit-config.yaml`, …) | deny-pass | Module's per-repo CI/packaging is irrelevant in-tree | From c7768a5377aaa16593aa1f6ffdcb38a764d700e3 Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Wed, 6 May 2026 09:57:38 -0500 Subject: [PATCH 15/15] COMP: Strip dynamic file(READ README.md DESCRIPTION) from itk-module.cmake Archival README.md files contain semicolons and bracket characters. CMake's foreach(arg \${ARGN}) splits on semicolons, so reading them into a DESCRIPTION argument produces CMake (dev) configure warnings for every module ingested via v4 (observed: RLEImage, SplitComponents, IOFDF, IOMeshMZ3, IOMeshSTL). Add patch_dynamic_description() to sanitize-history.py to strip the get_filename_component/file(READ README.md) preamble and replace DESCRIPTION "\${DOCUMENTATION}" with a static one-liner during history rewrite. Document the pattern in INGESTION_STRATEGY_v4.md. --- .../INGESTION_STRATEGY_v4.md | 1 + .../RemoteModuleIngest/sanitize-history.py | 45 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md index 3b7d6e543a3..2123c8fb686 100644 --- a/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md +++ b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY_v4.md @@ -274,6 +274,7 @@ by the v4 sanitizer; operators do not need to fix them by hand: | `if(NOT ITK_SOURCE_DIR) ... find_package(ITK) ... else() itk_module_impl() endif()` standalone-build guard in module `CMakeLists.txt` | `sanitize-history.py:patch_standalone_build_guard` | In-tree, `ITK_SOURCE_DIR` is always defined; the if-branch is dead code. (@dzenanz, PR #6206) | | `cmake_minimum_required(VERSION X.Y.Z)` line at the top of an ingested module's `CMakeLists.txt` | `sanitize-history.py:patch_drop_cmake_minimum_required` | ITK's top-level CMakeLists pins a higher minimum; per-module declarations are redundant and frequently **lower** than the ITK floor (3.10.2 is common upstream). (@dzenanz, PR #6215 IOFDF) | | `README.rst` references in CMake `file(READ ...)` calls | `sanitize-history.py:patch_readme_reference` | Phase B archival promotes `MIGRATION_README.md` to `README.md`; in-tree consumers read the markdown form | +| `get_filename_component(...) / file(READ README.md DOCUMENTATION)` preamble + `DESCRIPTION "${DOCUMENTATION}"` in `itk-module.cmake` | `sanitize-history.py:patch_dynamic_description` | Archival `README.md` contains semicolons and `[` characters that CMake list expansion splits into spurious `itk_module()` arguments, producing `CMake Warning (dev): Unknown argument` on every configure (observed: RLEImage, SplitComponents, IOFDF, IOMeshMZ3, IOMeshSTL; fixed in ITK PR #6220) | | `*.orig`, `*.rej`, `*.BACKUP.*`, `*.LOCAL.*`, `*.REMOTE.*`, `*.BASE.*` | deny-pass | Leftover merge-conflict artifacts | | Scaffolding (`Dockerfile*`, `azure-pipelines*.yml`, `.github/`, `.travis.yml`, `.circleci/`, `tox.ini`, `pyproject.toml`, `setup.py`, `.clang-format`, `.pre-commit-config.yaml`, …) | deny-pass | Module's per-repo CI/packaging is irrelevant in-tree | diff --git a/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py index 70d100104d5..fd91d62efeb 100755 --- a/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py +++ b/Utilities/Maintenance/RemoteModuleIngest/sanitize-history.py @@ -422,6 +422,46 @@ def patch_readme_reference(data: bytes) -> tuple[bytes, bool]: return new_data, n > 0 +# After ingest, the in-tree README.md is the archival migration notice. +# It contains semicolons and bracket characters that CMake's +# foreach(arg ${ARGN}) expands into spurious arguments, triggering +# CMake Warning (dev): Unknown argument [...] +# in ITKModuleMacros.cmake for every configure. +# +# The pattern to eliminate: +# get_filename_component(MY_CURRENT_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +# file(READ "${MY_CURRENT_DIR}/README.md" DOCUMENTATION) +# ... +# DESCRIPTION "${DOCUMENTATION}" +# +# Replace the two preamble lines with nothing and the DESCRIPTION arg +# with a static one-liner. +_README_FILE_READ_RE = re.compile( + rb"[ \t]*(?:get_filename_component\s*\([^)]*\)\s*\n)?" + rb"[ \t]*file\s*\(\s*READ\s+[^\n]*README\.md[^\n]*\)\s*\n", + re.IGNORECASE, +) +_DOCUMENTATION_DESCRIPTION_RE = re.compile( + rb'([ \t]*DESCRIPTION\s+)"?\$\{DOCUMENTATION\}"?', + re.IGNORECASE, +) + + +def patch_dynamic_description(data: bytes) -> tuple[bytes, bool]: + """Remove the file(READ README.md DOCUMENTATION) preamble and replace + DESCRIPTION "${DOCUMENTATION}" with a static one-liner. + + The archival README.md contains semicolons and bracket characters that + CMake list expansion splits into spurious itk_module() arguments, + producing CMake (dev) configure warnings. Returns (new_data, changed). + """ + new_data, n1 = _README_FILE_READ_RE.subn(b"", data) + new_data, n2 = _DOCUMENTATION_DESCRIPTION_RE.subn( + rb'\1"Module ingested from upstream."', new_data + ) + return new_data, (n1 + n2) > 0 + + # Upstream remote modules wrap their top-level CMakeLists.txt in a # standalone-build guard so the same file works both as a # remote-fetched module and as a stand-alone CMake project: @@ -536,6 +576,7 @@ def __init__( self.subject_blank_inserts = 0 self.mode_normalizations = 0 self.readme_ref_patches = 0 + self.dynamic_description_patches = 0 self.standalone_guard_patches = 0 self.cmake_min_required_drops = 0 self.blob_count = 0 @@ -570,6 +611,9 @@ def blob_callback(self, blob: fr.Blob, _metadata: dict) -> None: new_data, readme_patched = patch_readme_reference(new_data) if readme_patched: self.readme_ref_patches += 1 + new_data, desc_patched = patch_dynamic_description(new_data) + if desc_patched: + self.dynamic_description_patches += 1 else: # 'text' new_data = original_data @@ -745,6 +789,7 @@ def main() -> int: f" blank lines inserted: {sanitizer.subject_blank_inserts}\n" f" exec-bits cleared: {sanitizer.mode_normalizations}\n" f" README.rst->.md fixes: {sanitizer.readme_ref_patches}\n" + f" dynamic DESCRIPTION rm:{sanitizer.dynamic_description_patches}\n" f" standalone-guard rm: {sanitizer.standalone_guard_patches}\n" f" cmake_min_required rm: {sanitizer.cmake_min_required_drops}\n" f" blobs scanned: {sanitizer.blob_count}\n"