From 21184d2439e5f5a2c756a2ea32a07364abd7825a Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Fri, 13 Feb 2026 17:12:04 +0000
Subject: [PATCH 01/11] ci: cache docker_images() result within a single
 process

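docker_images() shells out to `docker images` on every call. It is
called once per DependencySet construction (~34 times during a full
mkpipeline run). Adding @cache avoids the redundant subprocess calls.
The return type changes from set to frozenset so that the shared
cached result cannot be mutated by callers. Note that, as with any
process-lifetime cache, images built or pulled later in the same
process are not reflected in the cached result.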

Measured on dev machine:
  34 uncached calls: 0.987s (29.0ms each)
  1 uncached + 33 cached: ~0.029s (the 33 cached calls take ~0.000s)
  Savings: ~0.96s

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 misc/python/materialize/mzbuild.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py
index 120057e88ec0d..3ee3dd7e1a714 100644
--- a/misc/python/materialize/mzbuild.py
+++ b/misc/python/materialize/mzbuild.py
@@ -234,9 +234,10 @@ def rewrite_builder_path_for_host(self, path: Path) -> Path:
             return path
 
 
-def docker_images() -> set[str]:
+@cache
+def docker_images() -> frozenset[str]:
     """List the Docker images available on the local machine."""
-    return set(
+    return frozenset(
         spawn.capture(["docker", "images", "--format", "{{.Repository}}:{{.Tag}}"])
         .strip()
         .split("\n")

From c4d29b29ed492336a8c1dd6557cbeb4f7fdac1d4 Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Fri, 13 Feb 2026 21:53:13 +0000
Subject: [PATCH 02/11] ci: use munge_services=False in trim_tests_pipeline

trim_tests_pipeline loads each composition with munge_services=True to
discover image dependencies. This triggers expensive fingerprinting and
dependency resolution for every composition. Since we only need to know
which mzbuild images a composition references (not their fingerprints),
use munge_services=False and extract image names directly from the
`mzbuild` key of each service config. Image names that are not present
in `deps` are skipped.

Measured on dev machine (36 compositions):
  munge_services=True:  5.82s
  munge_services=False: 2.95s
  Savings: 2.87s (2x speedup)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ci/mkpipeline.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/ci/mkpipeline.py b/ci/mkpipeline.py
index 9acdabb5859dc..543cc8fa73730 100644
--- a/ci/mkpipeline.py
+++ b/ci/mkpipeline.py
@@ -718,8 +718,25 @@ def trim_tests_pipeline(
             files = future.result()
             imported_files[path] = files
 
+    # Cache compositions loaded with munge_services=False to extract image
+    # names from their service configs. This avoids expensive fingerprinting
+    # and dependency resolution that munge_services=True triggers.
     compositions: dict[str, Composition] = {}
 
+    def get_composition_image_deps(
+        name: str,
+    ) -> list[mzbuild.ResolvedImage]:
+        """Get the mzbuild image dependencies for a composition without
+        doing expensive fingerprinting/dependency resolution."""
+        if name not in compositions:
+            compositions[name] = Composition(repo, name, munge_services=False)
+        comp = compositions[name]
+        image_names = []
+        for _svc_name, config in comp.compose.get("services", {}).items():
+            if "mzbuild" in config:
+                image_names.append(config["mzbuild"])
+        return [deps[img_name] for img_name in image_names if img_name in deps]
+
     def to_step(config: dict[str, Any]) -> PipelineStep | None:
         if "wait" in config or "group" in config:
             return None
@@ -740,9 +757,7 @@ def to_step(config: dict[str, Any]) -> PipelineStep | None:
                 for plugin_name, plugin_config in plugin.items():
                     if plugin_name == "./ci/plugins/mzcompose":
                         name = plugin_config["composition"]
-                        if name not in compositions:
-                            compositions[name] = Composition(repo, name)
-                        for dep in compositions[name].dependencies:
+                        for dep in get_composition_image_deps(name):
                             step.image_dependencies.add(dep)
                         composition_path = str(repo.compositions[name])
                         step.extra_inputs.add(composition_path)

From 827672e280a6a4e4056f196b970d72636d60b2d1 Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Fri, 13 Feb 2026 21:54:11 +0000
Subject: [PATCH 03/11] ci: skip munge_services for list-workflows command

list-workflows only needs to enumerate workflow function names from the
mzcompose.py module. It does not need resolved image specs or
fingerprints. Pass munge_services=False to skip the expensive
dependency resolution.

This is called once per CI step from the mzcompose plugin hook.

Measured on dev machine (cluster composition):
  munge_services=True:  2.454s
  munge_services=False: 0.075s
  Savings: 2.379s per invocation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 misc/python/materialize/cli/mzcompose.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/misc/python/materialize/cli/mzcompose.py b/misc/python/materialize/cli/mzcompose.py
index b09be00eb037b..f3a5b8be519c1 100644
--- a/misc/python/materialize/cli/mzcompose.py
+++ b/misc/python/materialize/cli/mzcompose.py
@@ -166,7 +166,9 @@ def main(argv: list[str]) -> None:
     args.command.invoke(args)
 
 
-def load_composition(args: argparse.Namespace) -> Composition:
+def load_composition(
+    args: argparse.Namespace, munge_services: bool = True
+) -> Composition:
     """Loads the composition specified by the command-line arguments."""
     if not args.ignore_docker_version:
         docker_local_version = Version.parse(
@@ -205,6 +207,7 @@ def load_composition(args: argparse.Namespace) -> Composition:
             project_name=args.project_name,
             sanity_restart_mz=args.sanity_restart_mz,
             host_network=args.host_network,
+            munge_services=munge_services,
         )
     except UnknownCompositionError as e:
         if args.find:
@@ -335,7 +338,7 @@ class ListWorkflowsCommand(Command):
     help = "list workflows in the composition"
 
     def run(self, args: argparse.Namespace) -> None:
-        composition = load_composition(args)
+        composition = load_composition(args, munge_services=False)
         for name in sorted(composition.workflows):
             print(name)
 

From bc42a489970df94591a5024a98d43a07b82ab35d Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Fri, 13 Feb 2026 21:54:53 +0000
Subject: [PATCH 04/11] ci: parallelize x86_64/aarch64 dependency resolution in
 fetch_hashes

fetch_hashes resolves dependencies for both architectures sequentially,
each involving expensive file fingerprinting. Since the two arch builds
are completely independent, resolve them in parallel using
ThreadPoolExecutor.

Measured on dev machine:
  Sequential: 8.06s
  Parallel:   1.90s
  Savings:    6.16s (4.2x speedup)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ci/mkpipeline.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/ci/mkpipeline.py b/ci/mkpipeline.py
index 543cc8fa73730..c2c3a74ac53ee 100644
--- a/ci/mkpipeline.py
+++ b/ci/mkpipeline.py
@@ -137,8 +137,16 @@ def get_hashes(arch: Arch) -> tuple[str, bool]:
         return (hash(deps), check)
 
     def fetch_hashes() -> None:
-        for arch in [Arch.AARCH64, Arch.X86_64]:
-            hash_check[arch] = get_hashes(arch)
+        # Resolve both architectures in parallel since they are independent
+        # and each involves expensive fingerprinting.
+        with ThreadPoolExecutor(max_workers=2) as pool:
+            futures = {
+                pool.submit(get_hashes, arch): arch
+                for arch in [Arch.AARCH64, Arch.X86_64]
+            }
+            for future in futures:
+                arch = futures[future]
+                hash_check[arch] = future.result()
 
     trim_builds_prep_thread = threading.Thread(target=fetch_hashes)
     trim_builds_prep_thread.start()

From 21e65586fc6ea219853d05fc24ef2e841d5a9075 Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Fri, 13 Feb 2026 22:00:25 +0000
Subject: [PATCH 05/11] ci: skip munge_services for description command

The `mzcompose description` command only needs the module docstring,
which is available without expensive dependency resolution and
fingerprinting. Use munge_services=False to skip that work.

This is called once per CI step via the mzcompose buildkite plugin
command hook (line 100: `TEST_DESC="$(mzcompose description)"`).

Measured savings: ~2.5s per `mzcompose description` call
  munge_services=True:  2.55s (full load_composition)
  munge_services=False: 0.23s (full load_composition)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 misc/python/materialize/cli/mzcompose.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/misc/python/materialize/cli/mzcompose.py b/misc/python/materialize/cli/mzcompose.py
index f3a5b8be519c1..8e74c447b925c 100644
--- a/misc/python/materialize/cli/mzcompose.py
+++ b/misc/python/materialize/cli/mzcompose.py
@@ -390,7 +390,7 @@ class DescriptionCommand(Command):
     help = "fetch the Python code description from mzcompose.py"
 
     def run(self, args: argparse.Namespace) -> None:
-        composition = load_composition(args)
+        composition = load_composition(args, munge_services=False)
         print(composition.description)
 
 

From 7ef7497c52f22394ab5ff1f57b8d1c5e0c8ddbf5 Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Fri, 13 Feb 2026 22:03:33 +0000
Subject: [PATCH 06/11] ci: skip munge_services for describe command

The `mzcompose describe` (aka `ls`/`list`) command only displays service
names, workflow names/docstrings, and the composition description. All of
this data is available without expensive dependency resolution and
fingerprinting.

This is primarily a local development speedup since describe is not
called in CI, but it makes `mzcompose ls` much more responsive.

Measured savings: ~2.5s per `mzcompose describe` call
  munge_services=True:  2.78s (full load_composition)
  munge_services=False: 0.25s (full load_composition)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 misc/python/materialize/cli/mzcompose.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/misc/python/materialize/cli/mzcompose.py b/misc/python/materialize/cli/mzcompose.py
index 8e74c447b925c..b29e072cd254d 100644
--- a/misc/python/materialize/cli/mzcompose.py
+++ b/misc/python/materialize/cli/mzcompose.py
@@ -349,7 +349,7 @@ class DescribeCommand(Command):
     help = "describe services and workflows in the composition"
 
     def run(self, args: argparse.Namespace) -> None:
-        composition = load_composition(args)
+        composition = load_composition(args, munge_services=False)
 
         workflows = []
         for name, fn in composition.workflows.items():

From ef534de64499121d499f6ffada58aa5b67a3c0b2 Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Fri, 13 Feb 2026 22:14:12 +0000
Subject: [PATCH 07/11] ci: batch crate input file discovery into a single git
 call

When resolving image dependencies, each Rust crate's input files were
discovered via individual `git diff` + `git ls-files` subprocess calls.
With ~118 crates across the workspace, this meant ~236 subprocess calls
just for crate file enumeration.

Add Workspace.precompute_crate_inputs() which does a single pair of
git calls to discover all crate files at once, then partitions the
results by crate path in Python. This is called automatically at the
start of resolve_dependencies().

Measured savings for resolve_dependencies(all 41 images):
  Before: 4.80s
  After:  2.84s
  Savings: 1.96s (41%)

Measured savings for single composition (pg-cdc, munge_services=True):
  Before: 2.57s
  After:  0.78s
  Savings: 1.80s (70%)

This benefits every `mzcompose up` and `mzcompose run` call in CI.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 misc/python/materialize/cargo.py   | 46 ++++++++++++++++++++++++++++++
 misc/python/materialize/mzbuild.py |  4 +++
 2 files changed, 50 insertions(+)

diff --git a/misc/python/materialize/cargo.py b/misc/python/materialize/cargo.py
index 7399f63553f46..0e5d7e378f35d 100644
--- a/misc/python/materialize/cargo.py
+++ b/misc/python/materialize/cargo.py
@@ -116,6 +116,8 @@ def inputs(self) -> set[str]:
         # † As a development convenience, we omit mzcompose configuration files
         # within a crate. This is technically incorrect if someone writes
         # `include!("mzcompose.py")`, but that seems like a crazy thing to do.
+        if hasattr(self, "_inputs_cache"):
+            return self._inputs_cache
         return git.expand_globs(
             self.root,
             f"{self.path}/**",
@@ -245,3 +247,47 @@ def visit(c: Crate) -> None:
             for d in crate.path_dev_dependencies:
                 visit(self.crates[d])
         return deps
+
+    def precompute_crate_inputs(self) -> None:
+        """Pre-fetch all crate input files in a single batched git call.
+
+        This replaces ~118 individual pairs of git subprocess calls with
+        a single pair, then partitions the results by crate path in Python.
+        """
+        from materialize import spawn
+
+        crate_paths = sorted(set(str(c.path) for c in self.all_crates.values()))
+
+        specs = []
+        for p in crate_paths:
+            specs.append(f"{p}/**")
+            specs.append(f":(exclude){p}/mzcompose")
+            specs.append(f":(exclude){p}/mzcompose.py")
+
+        root = next(iter(self.all_crates.values())).root
+        empty_tree = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
+        diff_files = spawn.capture(
+            ["git", "diff", "--name-only", "-z", "--relative", empty_tree, "--"]
+            + specs,
+            cwd=root,
+        )
+        ls_files = spawn.capture(
+            ["git", "ls-files", "--others", "--exclude-standard", "-z", "--"] + specs,
+            cwd=root,
+        )
+        all_files = set(
+            f for f in (diff_files + ls_files).split("\0") if f.strip() != ""
+        )
+
+        # Partition files by crate path (longest match first for nested crates)
+        crate_file_map: dict[str, set[str]] = {p: set() for p in crate_paths}
+        sorted_paths = sorted(crate_paths, key=len, reverse=True)
+        for f in all_files:
+            for cp in sorted_paths:
+                if f.startswith(cp + "/"):
+                    crate_file_map[cp].add(f)
+                    break
+
+        # Inject cached results into each Crate object
+        for crate in self.all_crates.values():
+            crate._inputs_cache = crate_file_map.get(str(crate.path), set())
diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py
index 3ee3dd7e1a714..d7f2417a50712 100644
--- a/misc/python/materialize/mzbuild.py
+++ b/misc/python/materialize/mzbuild.py
@@ -1487,6 +1487,10 @@ def resolve_dependencies(self, targets: Iterable[Image]) -> DependencySet:
            ValueError: A circular dependency was discovered in the images
                in the repository.
         """
+        # Pre-fetch all crate input files in a single batched git call,
+        # replacing ~118 individual subprocess pairs with one pair.
+        self.rd.cargo_workspace.precompute_crate_inputs()
+
         resolved = OrderedDict()
         visiting = set()
 

From 67b268f6b27654e5c00c41f1b6b316273ff3b8e8 Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Fri, 13 Feb 2026 22:18:48 +0000
Subject: [PATCH 08/11] ci: batch image context file discovery into a single
 git call

When computing image fingerprints, each image's context files were
discovered via individual `git diff` + `git ls-files` subprocess calls.
With 41 images, this meant 82 subprocess calls just for image context
enumeration.

Add Repository._precompute_image_context_files() which does a single
pair of git calls to discover all image context files at once, then
partitions results by image path. This is called automatically at the
start of resolve_dependencies().

Combined with the crate input batching from the previous commit:

Measured savings for resolve_dependencies(all 41 images):
  Before (no batching):       4.17s
  After (both batched):       1.85s
  Savings: 2.32s (56%)

Measured savings for single composition (pg-cdc, munge_services=True):
  Before: 2.43s
  After:  0.64s
  Savings: 1.79s (74%)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 misc/python/materialize/mzbuild.py | 52 +++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py
index d7f2417a50712..f92e28f990a92 100644
--- a/misc/python/materialize/mzbuild.py
+++ b/misc/python/materialize/mzbuild.py
@@ -1067,7 +1067,10 @@ def inputs(self, transitive: bool = False) -> set[str]:
             inputs: A list of input files, relative to the root of the
                 repository.
         """
-        paths = set(git.expand_globs(self.image.rd.root, f"{self.image.path}/**"))
+        if hasattr(self.image, "_context_files_cache"):
+            paths = set(self.image._context_files_cache)
+        else:
+            paths = set(git.expand_globs(self.image.rd.root, f"{self.image.path}/**"))
         if not paths:
             # While we could find an `mzbuild.yml` file for this service, expland_globs didn't
             # return any files that matched this service. At the very least, the `mzbuild.yml`
@@ -1490,6 +1493,9 @@ def resolve_dependencies(self, targets: Iterable[Image]) -> DependencySet:
         # Pre-fetch all crate input files in a single batched git call,
         # replacing ~118 individual subprocess pairs with one pair.
         self.rd.cargo_workspace.precompute_crate_inputs()
+        # Pre-fetch all image context files in a single batched git call,
+        # replacing ~41 individual subprocess pairs with one pair.
+        self._precompute_image_context_files()
 
         resolved = OrderedDict()
         visiting = set()
@@ -1511,6 +1517,50 @@ def visit(image: Image, path: list[str] = []) -> None:
 
         return DependencySet(resolved.values())
 
+    def _precompute_image_context_files(self) -> None:
+        """Pre-fetch all image context files in a single batched git call.
+
+        This replaces ~41 individual pairs of git subprocess calls (one per
+        image) with a single pair, then partitions the results by image path.
+        The results are injected into the expand_globs cache.
+        """
+        image_paths = sorted(set(str(img.path) for img in self.images.values()))
+        specs = [f"{p}/**" for p in image_paths]
+
+        root = self.rd.root
+        empty_tree = (
+            "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
+        )
+        diff_files = spawn.capture(
+            ["git", "diff", "--name-only", "-z", "--relative", empty_tree, "--"]
+            + specs,
+            cwd=root,
+        )
+        ls_files = spawn.capture(
+            ["git", "ls-files", "--others", "--exclude-standard", "-z", "--"] + specs,
+            cwd=root,
+        )
+        all_files = set(
+            f for f in (diff_files + ls_files).split("\0") if f.strip() != ""
+        )
+
+        # Partition files by image path (longest match first for nested paths)
+        image_file_map: dict[str, set[str]] = {p: set() for p in image_paths}
+        sorted_paths = sorted(image_paths, key=len, reverse=True)
+        for f in all_files:
+            for ip in sorted_paths:
+                if f.startswith(ip + "/") or f.startswith(str(Path(ip)) + "/"):
+                    image_file_map[ip].add(f)
+                    break
+
+        # Inject results into the expand_globs cache by calling it with
+        # the same arguments that ResolvedImage.inputs() will use.
+        # Since expand_globs is @functools.cache, we need to populate the
+        # cache with the right keys. We do this by replacing the function
+        # temporarily to return our pre-computed results.
+        for img in self.images.values():
+            img._context_files_cache = image_file_map.get(str(img.path), set())
+
     def __iter__(self) -> Iterator[Image]:
         return iter(self.images.values())
 

From 3a2433a68c498c8e75ea91f0b3987caf1b96ca3a Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Fri, 13 Feb 2026 22:22:24 +0000
Subject: [PATCH 09/11] ci: eliminate remaining git subprocess calls from
 fingerprinting

Two changes eliminate the remaining ~82 git subprocess calls from the
fingerprinting path:

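1. CargoPreImage.inputs() now resolves its hardcoded inputs eagerly:
   - 'ci/builder' directory is expanded to individual files via a single
     cached expand_globs call
   - '.cargo/config' is included only if it exists
   - Result is cached with @cache since it's the same for all images
   - Both the expansion and the existence check use paths relative to
     the current working directory (Path(".")), so this assumes the
     process runs from the repository root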

2. ResolvedImage.fingerprint() skips the expand_globs verification
   pass when precomputed data is available, since all inputs are
   already individual file paths from git.

This eliminates all per-crate and per-image git subprocess calls from
resolve_dependencies, reducing the total from ~384 calls (baseline) to
just 5 (2 for crate batch + 2 for image batch + 1 for ci/builder).

Measured savings for resolve_dependencies(all 41 images):
  Before (no batching):  4.15s
  After (all batching):  0.40s
  Savings: 3.75s (90%)

Measured savings for single composition (pg-cdc, munge_services=True):
  Before: 2.23s
  After:  0.26s
  Savings: 1.97s (88%)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 misc/python/materialize/mzbuild.py | 43 ++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py
index f92e28f990a92..dd3e2d60e6d3f 100644
--- a/misc/python/materialize/mzbuild.py
+++ b/misc/python/materialize/mzbuild.py
@@ -468,18 +468,25 @@ def inputs(self) -> set[str]:
 class CargoPreImage(PreImage):
     """A `PreImage` action that uses Cargo."""
 
-    def inputs(self) -> set[str]:
-        inputs = {
-            "ci/builder",
-            "Cargo.toml",
-            # TODO(benesch): we could in theory fingerprint only the subset of
-            # Cargo.lock that applies to the crates at hand, but that is a
-            # *lot* of work.
-            "Cargo.lock",
-            ".cargo/config",
-        }
+    @staticmethod
+    @cache
+    def _cargo_shared_inputs() -> frozenset[str]:
+        """Resolve shared Cargo inputs once and cache the result.
 
-        return inputs
+        This expands the 'ci/builder' directory glob and filters out
+        non-existent files like '.cargo/config', avoiding repeated
+        git subprocess calls in fingerprint().
+        """
+        inputs: set[str] = set()
+        inputs |= git.expand_globs(Path("."), "ci/builder/**")
+        inputs.add("Cargo.toml")
+        inputs.add("Cargo.lock")
+        if Path(".cargo/config").exists():
+            inputs.add(".cargo/config")
+        return frozenset(inputs)
+
+    def inputs(self) -> set[str]:
+        return set(CargoPreImage._cargo_shared_inputs())
 
     def extra(self) -> str:
         # Cargo images depend on the release mode and whether
@@ -1098,9 +1105,17 @@ def fingerprint(self) -> Fingerprint:
         inputs via `PreImage.inputs`.
         """
         self_hash = hashlib.sha1()
-        for rel_path in sorted(
-            set(git.expand_globs(self.image.rd.root, *self.inputs()))
-        ):
+        # When inputs come from precomputed sources (crate and image context
+        # batching + resolved CargoPreImage paths), they are already individual
+        # file paths from git. Skip the expensive expand_globs subprocess calls.
+        inputs = self.inputs()
+        if hasattr(self.image, "_context_files_cache"):
+            resolved_inputs = sorted(inputs)
+        else:
+            resolved_inputs = sorted(
+                set(git.expand_globs(self.image.rd.root, *inputs))
+            )
+        for rel_path in resolved_inputs:
             abs_path = self.image.rd.root / rel_path
             file_hash = hashlib.sha1()
             raw_file_mode = os.lstat(abs_path).st_mode

From 8b621ccd1759724719e1e6b7a45ea67a87345c72 Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Fri, 13 Feb 2026 22:36:16 +0000
Subject: [PATCH 10/11] ci: fix batched file discovery with absolute root paths

The crate and image context precomputation used str(path) directly for
both git pathspecs and the file partitioning step. This works when the
repository root is a relative path (Path(".")), but fails when it's an
absolute path (as it is when MZ_ROOT is set in CI via mzcompose).

The issue: git --relative outputs paths relative to cwd, but the
partition logic compared these relative paths against potentially
absolute image/crate paths, causing the startswith() check to fail.
This left _context_files_cache empty, triggering the "files are
unknown to git" assertion.

Fix: use path.relative_to(root) to normalize all paths before
constructing git specs and partitioning results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 misc/python/materialize/cargo.py   | 18 +++++++++++------
 misc/python/materialize/mzbuild.py | 32 +++++++++++++-----------------
 2 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/misc/python/materialize/cargo.py b/misc/python/materialize/cargo.py
index 0e5d7e378f35d..3dde480ba0869 100644
--- a/misc/python/materialize/cargo.py
+++ b/misc/python/materialize/cargo.py
@@ -256,15 +256,20 @@ def precompute_crate_inputs(self) -> None:
         """
         from materialize import spawn
 
-        crate_paths = sorted(set(str(c.path) for c in self.all_crates.values()))
+        root = next(iter(self.all_crates.values())).root
+        # Use paths relative to root for git specs and partitioning, since
+        # git --relative outputs paths relative to cwd (root). Crate paths
+        # may be absolute when MZ_ROOT is an absolute path.
+        crate_rel_paths = sorted(
+            set(str(c.path.relative_to(root)) for c in self.all_crates.values())
+        )
 
         specs = []
-        for p in crate_paths:
+        for p in crate_rel_paths:
             specs.append(f"{p}/**")
             specs.append(f":(exclude){p}/mzcompose")
             specs.append(f":(exclude){p}/mzcompose.py")
 
-        root = next(iter(self.all_crates.values())).root
         empty_tree = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
         diff_files = spawn.capture(
             ["git", "diff", "--name-only", "-z", "--relative", empty_tree, "--"]
@@ -280,8 +285,8 @@ def precompute_crate_inputs(self) -> None:
         )
 
         # Partition files by crate path (longest match first for nested crates)
-        crate_file_map: dict[str, set[str]] = {p: set() for p in crate_paths}
-        sorted_paths = sorted(crate_paths, key=len, reverse=True)
+        crate_file_map: dict[str, set[str]] = {p: set() for p in crate_rel_paths}
+        sorted_paths = sorted(crate_rel_paths, key=len, reverse=True)
         for f in all_files:
             for cp in sorted_paths:
                 if f.startswith(cp + "/"):
@@ -290,4 +295,5 @@ def precompute_crate_inputs(self) -> None:
 
         # Inject cached results into each Crate object
         for crate in self.all_crates.values():
-            crate._inputs_cache = crate_file_map.get(str(crate.path), set())
+            rel = str(crate.path.relative_to(root))
+            crate._inputs_cache = crate_file_map.get(rel, set())
diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py
index dd3e2d60e6d3f..401fa11983394 100644
--- a/misc/python/materialize/mzbuild.py
+++ b/misc/python/materialize/mzbuild.py
@@ -1112,9 +1112,7 @@ def fingerprint(self) -> Fingerprint:
         if hasattr(self.image, "_context_files_cache"):
             resolved_inputs = sorted(inputs)
         else:
-            resolved_inputs = sorted(
-                set(git.expand_globs(self.image.rd.root, *inputs))
-            )
+            resolved_inputs = sorted(set(git.expand_globs(self.image.rd.root, *inputs)))
         for rel_path in resolved_inputs:
             abs_path = self.image.rd.root / rel_path
             file_hash = hashlib.sha1()
@@ -1537,15 +1535,17 @@ def _precompute_image_context_files(self) -> None:
 
         This replaces ~41 individual pairs of git subprocess calls (one per
         image) with a single pair, then partitions the results by image path.
-        The results are injected into the expand_globs cache.
         """
-        image_paths = sorted(set(str(img.path) for img in self.images.values()))
-        specs = [f"{p}/**" for p in image_paths]
-
         root = self.rd.root
-        empty_tree = (
-            "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
+        # Use paths relative to root for git specs and partitioning, since
+        # git --relative outputs paths relative to cwd (root). Image paths
+        # may be absolute when MZ_ROOT is an absolute path.
+        image_rel_paths = sorted(
+            set(str(img.path.relative_to(root)) for img in self.images.values())
         )
+        specs = [f"{p}/**" for p in image_rel_paths]
+
+        empty_tree = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
         diff_files = spawn.capture(
             ["git", "diff", "--name-only", "-z", "--relative", empty_tree, "--"]
             + specs,
@@ -1560,21 +1560,17 @@ def _precompute_image_context_files(self) -> None:
         )
 
         # Partition files by image path (longest match first for nested paths)
-        image_file_map: dict[str, set[str]] = {p: set() for p in image_paths}
-        sorted_paths = sorted(image_paths, key=len, reverse=True)
+        image_file_map: dict[str, set[str]] = {p: set() for p in image_rel_paths}
+        sorted_paths = sorted(image_rel_paths, key=len, reverse=True)
         for f in all_files:
             for ip in sorted_paths:
-                if f.startswith(ip + "/") or f.startswith(str(Path(ip)) + "/"):
+                if f.startswith(ip + "/"):
                     image_file_map[ip].add(f)
                     break
 
-        # Inject results into the expand_globs cache by calling it with
-        # the same arguments that ResolvedImage.inputs() will use.
-        # Since expand_globs is @functools.cache, we need to populate the
-        # cache with the right keys. We do this by replacing the function
-        # temporarily to return our pre-computed results.
         for img in self.images.values():
-            img._context_files_cache = image_file_map.get(str(img.path), set())
+            rel = str(img.path.relative_to(root))
+            img._context_files_cache = image_file_map.get(rel, set())
 
     def __iter__(self) -> Iterator[Image]:
         return iter(self.images.values())

From 710a3c958d582ce3b181f985d805b1c68eddf219 Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Fri, 13 Feb 2026 23:03:55 +0000
Subject: [PATCH 11/11] ci: declare cache attributes as class fields for
 pyright

Pyright's type checker requires attributes to be declared on the class.
Declare _inputs_cache on Crate and _context_files_cache on Image as
`set[str] | None` fields, initialized to None in __init__, and replace
hasattr() checks with `is not None` comparisons.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 misc/python/materialize/cargo.py   | 5 ++++-
 misc/python/materialize/mzbuild.py | 7 +++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/misc/python/materialize/cargo.py b/misc/python/materialize/cargo.py
index 3dde480ba0869..e6bb51b307641 100644
--- a/misc/python/materialize/cargo.py
+++ b/misc/python/materialize/cargo.py
@@ -48,8 +48,11 @@ class Crate:
         examples: The names of all examples in the crate.
     """
 
+    _inputs_cache: set[str] | None
+
     def __init__(self, root: Path, path: Path):
         self.root = root
+        self._inputs_cache = None
         with open(path / "Cargo.toml") as f:
             config = toml.load(f)
         self.name = config["package"]["name"]
@@ -116,7 +119,7 @@ def inputs(self) -> set[str]:
         # † As a development convenience, we omit mzcompose configuration files
         # within a crate. This is technically incorrect if someone writes
         # `include!("mzcompose.py")`, but that seems like a crazy thing to do.
-        if hasattr(self, "_inputs_cache"):
+        if self._inputs_cache is not None:
             return self._inputs_cache
         return git.expand_globs(
             self.root,
diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py
index 401fa11983394..c7e2d3daba387 100644
--- a/misc/python/materialize/mzbuild.py
+++ b/misc/python/materialize/mzbuild.py
@@ -758,9 +758,12 @@ class Image:
 
     _DOCKERFILE_MZFROM_RE = re.compile(rb"^MZFROM\s*(\S+)")
 
+    _context_files_cache: set[str] | None
+
     def __init__(self, rd: RepositoryDetails, path: Path):
         self.rd = rd
         self.path = path
+        self._context_files_cache = None
         self.pre_images: list[PreImage] = []
         with open(self.path / "mzbuild.yml") as f:
             data = yaml.safe_load(f)
@@ -1074,7 +1077,7 @@ def inputs(self, transitive: bool = False) -> set[str]:
             inputs: A list of input files, relative to the root of the
                 repository.
         """
-        if hasattr(self.image, "_context_files_cache"):
+        if self.image._context_files_cache is not None:
             paths = set(self.image._context_files_cache)
         else:
             paths = set(git.expand_globs(self.image.rd.root, f"{self.image.path}/**"))
@@ -1109,7 +1112,7 @@ def fingerprint(self) -> Fingerprint:
         # batching + resolved CargoPreImage paths), they are already individual
         # file paths from git. Skip the expensive expand_globs subprocess calls.
         inputs = self.inputs()
-        if hasattr(self.image, "_context_files_cache"):
+        if self.image._context_files_cache is not None:
             resolved_inputs = sorted(inputs)
         else:
             resolved_inputs = sorted(set(git.expand_globs(self.image.rd.root, *inputs)))