From 21184d2439e5f5a2c756a2ea32a07364abd7825a Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 13 Feb 2026 17:12:04 +0000 Subject: [PATCH 01/11] ci: cache docker_images() result within a single process docker_images() shells out to `docker images` on every call. It is called once per DependencySet construction (~34 times during a full mkpipeline run). Adding @cache avoids redundant subprocess calls. Measured on dev machine: 34 uncached calls: 0.987s (29.0ms each) 1 uncached + 33 cached: ~0.029s (the 33 cached lookups themselves: 0.000s) Savings: ~0.96s Co-Authored-By: Claude Opus 4.6 --- misc/python/materialize/mzbuild.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py index 120057e88ec0d..3ee3dd7e1a714 100644 --- a/misc/python/materialize/mzbuild.py +++ b/misc/python/materialize/mzbuild.py @@ -234,9 +234,10 @@ def rewrite_builder_path_for_host(self, path: Path) -> Path: return path -def docker_images() -> set[str]: +@cache +def docker_images() -> frozenset[str]: """List the Docker images available on the local machine.""" - return set( + return frozenset( spawn.capture(["docker", "images", "--format", "{{.Repository}}:{{.Tag}}"]) .strip() .split("\n") From c4d29b29ed492336a8c1dd6557cbeb4f7fdac1d4 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 13 Feb 2026 21:53:13 +0000 Subject: [PATCH 02/11] ci: use munge_services=False in trim_tests_pipeline trim_tests_pipeline loads each composition with munge_services=True to discover image dependencies. This triggers expensive fingerprinting and dependency resolution for every composition. Since we only need to know which mzbuild images a composition references (not their fingerprints), use munge_services=False and extract image names directly from the service configs. 
Measured on dev machine (36 compositions): munge_services=True: 5.82s munge_services=False: 2.95s Savings: 2.87s (2x speedup) Co-Authored-By: Claude Opus 4.6 --- ci/mkpipeline.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/ci/mkpipeline.py b/ci/mkpipeline.py index 9acdabb5859dc..543cc8fa73730 100644 --- a/ci/mkpipeline.py +++ b/ci/mkpipeline.py @@ -718,8 +718,25 @@ def trim_tests_pipeline( files = future.result() imported_files[path] = files + # Cache compositions loaded with munge_services=False to extract image + # names from their service configs. This avoids expensive fingerprinting + # and dependency resolution that munge_services=True triggers. compositions: dict[str, Composition] = {} + def get_composition_image_deps( + name: str, + ) -> list[mzbuild.ResolvedImage]: + """Get the mzbuild image dependencies for a composition without + doing expensive fingerprinting/dependency resolution.""" + if name not in compositions: + compositions[name] = Composition(repo, name, munge_services=False) + comp = compositions[name] + image_names = [] + for _svc_name, config in comp.compose.get("services", {}).items(): + if "mzbuild" in config: + image_names.append(config["mzbuild"]) + return [deps[img_name] for img_name in image_names if img_name in deps] + def to_step(config: dict[str, Any]) -> PipelineStep | None: if "wait" in config or "group" in config: return None @@ -740,9 +757,7 @@ def to_step(config: dict[str, Any]) -> PipelineStep | None: for plugin_name, plugin_config in plugin.items(): if plugin_name == "./ci/plugins/mzcompose": name = plugin_config["composition"] - if name not in compositions: - compositions[name] = Composition(repo, name) - for dep in compositions[name].dependencies: + for dep in get_composition_image_deps(name): step.image_dependencies.add(dep) composition_path = str(repo.compositions[name]) step.extra_inputs.add(composition_path) From 827672e280a6a4e4056f196b970d72636d60b2d1 Mon Sep 17 00:00:00 2001 
From: Dennis Felsing Date: Fri, 13 Feb 2026 21:54:11 +0000 Subject: [PATCH 03/11] ci: skip munge_services for list-workflows command list-workflows only needs to enumerate workflow function names from the mzcompose.py module. It does not need resolved image specs or fingerprints. Pass munge_services=False to skip the expensive dependency resolution. This is called once per CI step from the mzcompose plugin hook. Measured on dev machine (cluster composition): munge_services=True: 2.454s munge_services=False: 0.075s Savings: 2.379s per invocation Co-Authored-By: Claude Opus 4.6 --- misc/python/materialize/cli/mzcompose.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/misc/python/materialize/cli/mzcompose.py b/misc/python/materialize/cli/mzcompose.py index b09be00eb037b..f3a5b8be519c1 100644 --- a/misc/python/materialize/cli/mzcompose.py +++ b/misc/python/materialize/cli/mzcompose.py @@ -166,7 +166,9 @@ def main(argv: list[str]) -> None: args.command.invoke(args) -def load_composition(args: argparse.Namespace) -> Composition: +def load_composition( + args: argparse.Namespace, munge_services: bool = True +) -> Composition: """Loads the composition specified by the command-line arguments.""" if not args.ignore_docker_version: docker_local_version = Version.parse( @@ -205,6 +207,7 @@ def load_composition(args: argparse.Namespace) -> Composition: project_name=args.project_name, sanity_restart_mz=args.sanity_restart_mz, host_network=args.host_network, + munge_services=munge_services, ) except UnknownCompositionError as e: if args.find: @@ -335,7 +338,7 @@ class ListWorkflowsCommand(Command): help = "list workflows in the composition" def run(self, args: argparse.Namespace) -> None: - composition = load_composition(args) + composition = load_composition(args, munge_services=False) for name in sorted(composition.workflows): print(name) From bc42a489970df94591a5024a98d43a07b82ab35d Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 13 Feb 2026 
21:54:53 +0000 Subject: [PATCH 04/11] ci: parallelize x86_64/aarch64 dependency resolution in fetch_hashes fetch_hashes resolves dependencies for both architectures sequentially, each involving expensive file fingerprinting. Since the two arch builds are completely independent, resolve them in parallel using ThreadPoolExecutor. Measured on dev machine: Sequential: 8.06s Parallel: 1.90s Savings: 6.16s (4.2x speedup) Co-Authored-By: Claude Opus 4.6 --- ci/mkpipeline.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ci/mkpipeline.py b/ci/mkpipeline.py index 543cc8fa73730..c2c3a74ac53ee 100644 --- a/ci/mkpipeline.py +++ b/ci/mkpipeline.py @@ -137,8 +137,16 @@ def get_hashes(arch: Arch) -> tuple[str, bool]: return (hash(deps), check) def fetch_hashes() -> None: - for arch in [Arch.AARCH64, Arch.X86_64]: - hash_check[arch] = get_hashes(arch) + # Resolve both architectures in parallel since they are independent + # and each involves expensive fingerprinting. + with ThreadPoolExecutor(max_workers=2) as pool: + futures = { + pool.submit(get_hashes, arch): arch + for arch in [Arch.AARCH64, Arch.X86_64] + } + for future in futures: + arch = futures[future] + hash_check[arch] = future.result() trim_builds_prep_thread = threading.Thread(target=fetch_hashes) trim_builds_prep_thread.start() From 21e65586fc6ea219853d05fc24ef2e841d5a9075 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 13 Feb 2026 22:00:25 +0000 Subject: [PATCH 05/11] ci: skip munge_services for description command The `mzcompose description` command only needs the module docstring, which is available without expensive dependency resolution and fingerprinting. Use munge_services=False to skip that work. This is called once per CI step via the mzcompose buildkite plugin command hook (line 100: `TEST_DESC="$(mzcompose description)"`). 
Measured savings: ~2.5s per `mzcompose description` call munge_services=True: 2.55s (full load_composition) munge_services=False: 0.23s (full load_composition) Co-Authored-By: Claude Opus 4.6 --- misc/python/materialize/cli/mzcompose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/python/materialize/cli/mzcompose.py b/misc/python/materialize/cli/mzcompose.py index f3a5b8be519c1..8e74c447b925c 100644 --- a/misc/python/materialize/cli/mzcompose.py +++ b/misc/python/materialize/cli/mzcompose.py @@ -390,7 +390,7 @@ class DescriptionCommand(Command): help = "fetch the Python code description from mzcompose.py" def run(self, args: argparse.Namespace) -> None: - composition = load_composition(args) + composition = load_composition(args, munge_services=False) print(composition.description) From 7ef7497c52f22394ab5ff1f57b8d1c5e0c8ddbf5 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 13 Feb 2026 22:03:33 +0000 Subject: [PATCH 06/11] ci: skip munge_services for describe command The `mzcompose describe` (aka `ls`/`list`) command only displays service names, workflow names/docstrings, and the composition description. All of this data is available without expensive dependency resolution and fingerprinting. This is primarily a local development speedup since describe is not called in CI, but it makes `mzcompose ls` much more responsive. 
Measured savings: ~2.5s per `mzcompose describe` call munge_services=True: 2.78s (full load_composition) munge_services=False: 0.25s (full load_composition) Co-Authored-By: Claude Opus 4.6 --- misc/python/materialize/cli/mzcompose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/python/materialize/cli/mzcompose.py b/misc/python/materialize/cli/mzcompose.py index 8e74c447b925c..b29e072cd254d 100644 --- a/misc/python/materialize/cli/mzcompose.py +++ b/misc/python/materialize/cli/mzcompose.py @@ -349,7 +349,7 @@ class DescribeCommand(Command): help = "describe services and workflows in the composition" def run(self, args: argparse.Namespace) -> None: - composition = load_composition(args) + composition = load_composition(args, munge_services=False) workflows = [] for name, fn in composition.workflows.items(): From ef534de64499121d499f6ffada58aa5b67a3c0b2 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 13 Feb 2026 22:14:12 +0000 Subject: [PATCH 07/11] ci: batch crate input file discovery into a single git call When resolving image dependencies, each Rust crate's input files were discovered via individual `git diff` + `git ls-files` subprocess calls. With ~118 crates across the workspace, this meant ~236 subprocess calls just for crate file enumeration. Add Workspace.precompute_crate_inputs() which does a single pair of git calls to discover all crate files at once, then partitions the results by crate path in Python. This is called automatically at the start of resolve_dependencies(). Measured savings for resolve_dependencies(all 41 images): Before: 4.80s After: 2.84s Savings: 1.96s (41%) Measured savings for single composition (pg-cdc, munge_services=True): Before: 2.57s After: 0.78s Savings: 1.80s (70%) This benefits every `mzcompose up` and `mzcompose run` call in CI. 
Co-Authored-By: Claude Opus 4.6 --- misc/python/materialize/cargo.py | 46 ++++++++++++++++++++++++++++++ misc/python/materialize/mzbuild.py | 4 +++ 2 files changed, 50 insertions(+) diff --git a/misc/python/materialize/cargo.py b/misc/python/materialize/cargo.py index 7399f63553f46..0e5d7e378f35d 100644 --- a/misc/python/materialize/cargo.py +++ b/misc/python/materialize/cargo.py @@ -116,6 +116,8 @@ def inputs(self) -> set[str]: # † As a development convenience, we omit mzcompose configuration files # within a crate. This is technically incorrect if someone writes # `include!("mzcompose.py")`, but that seems like a crazy thing to do. + if hasattr(self, "_inputs_cache"): + return self._inputs_cache return git.expand_globs( self.root, f"{self.path}/**", @@ -245,3 +247,47 @@ def visit(c: Crate) -> None: for d in crate.path_dev_dependencies: visit(self.crates[d]) return deps + + def precompute_crate_inputs(self) -> None: + """Pre-fetch all crate input files in a single batched git call. + + This replaces ~118 individual pairs of git subprocess calls with + a single pair, then partitions the results by crate path in Python. 
+ """ + from materialize import spawn + + crate_paths = sorted(set(str(c.path) for c in self.all_crates.values())) + + specs = [] + for p in crate_paths: + specs.append(f"{p}/**") + specs.append(f":(exclude){p}/mzcompose") + specs.append(f":(exclude){p}/mzcompose.py") + + root = next(iter(self.all_crates.values())).root + empty_tree = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" + diff_files = spawn.capture( + ["git", "diff", "--name-only", "-z", "--relative", empty_tree, "--"] + + specs, + cwd=root, + ) + ls_files = spawn.capture( + ["git", "ls-files", "--others", "--exclude-standard", "-z", "--"] + specs, + cwd=root, + ) + all_files = set( + f for f in (diff_files + ls_files).split("\0") if f.strip() != "" + ) + + # Partition files by crate path (longest match first for nested crates) + crate_file_map: dict[str, set[str]] = {p: set() for p in crate_paths} + sorted_paths = sorted(crate_paths, key=len, reverse=True) + for f in all_files: + for cp in sorted_paths: + if f.startswith(cp + "/"): + crate_file_map[cp].add(f) + break + + # Inject cached results into each Crate object + for crate in self.all_crates.values(): + crate._inputs_cache = crate_file_map.get(str(crate.path), set()) diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py index 3ee3dd7e1a714..d7f2417a50712 100644 --- a/misc/python/materialize/mzbuild.py +++ b/misc/python/materialize/mzbuild.py @@ -1487,6 +1487,10 @@ def resolve_dependencies(self, targets: Iterable[Image]) -> DependencySet: ValueError: A circular dependency was discovered in the images in the repository. """ + # Pre-fetch all crate input files in a single batched git call, + # replacing ~118 individual subprocess pairs with one pair. 
+ self.rd.cargo_workspace.precompute_crate_inputs() + resolved = OrderedDict() visiting = set() From 67b268f6b27654e5c00c41f1b6b316273ff3b8e8 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 13 Feb 2026 22:18:48 +0000 Subject: [PATCH 08/11] ci: batch image context file discovery into a single git call When computing image fingerprints, each image's context files were discovered via individual `git diff` + `git ls-files` subprocess calls. With 41 images, this meant 82 subprocess calls just for image context enumeration. Add Repository._precompute_image_context_files() which does a single pair of git calls to discover all image context files at once, then partitions results by image path. This is called automatically at the start of resolve_dependencies(). Combined with the crate input batching from the previous commit: Measured savings for resolve_dependencies(all 41 images): Before (no batching): 4.17s After (both batched): 1.85s Savings: 2.32s (56%) Measured savings for single composition (pg-cdc, munge_services=True): Before: 2.43s After: 0.64s Savings: 1.79s (74%) Co-Authored-By: Claude Opus 4.6 --- misc/python/materialize/mzbuild.py | 52 +++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py index d7f2417a50712..f92e28f990a92 100644 --- a/misc/python/materialize/mzbuild.py +++ b/misc/python/materialize/mzbuild.py @@ -1067,7 +1067,10 @@ def inputs(self, transitive: bool = False) -> set[str]: inputs: A list of input files, relative to the root of the repository. 
""" - paths = set(git.expand_globs(self.image.rd.root, f"{self.image.path}/**")) + if hasattr(self.image, "_context_files_cache"): + paths = set(self.image._context_files_cache) + else: + paths = set(git.expand_globs(self.image.rd.root, f"{self.image.path}/**")) if not paths: # While we could find an `mzbuild.yml` file for this service, expland_globs didn't # return any files that matched this service. At the very least, the `mzbuild.yml` @@ -1490,6 +1493,9 @@ def resolve_dependencies(self, targets: Iterable[Image]) -> DependencySet: # Pre-fetch all crate input files in a single batched git call, # replacing ~118 individual subprocess pairs with one pair. self.rd.cargo_workspace.precompute_crate_inputs() + # Pre-fetch all image context files in a single batched git call, + # replacing ~41 individual subprocess pairs with one pair. + self._precompute_image_context_files() resolved = OrderedDict() visiting = set() @@ -1511,6 +1517,50 @@ def visit(image: Image, path: list[str] = []) -> None: return DependencySet(resolved.values()) + def _precompute_image_context_files(self) -> None: + """Pre-fetch all image context files in a single batched git call. + + This replaces ~41 individual pairs of git subprocess calls (one per + image) with a single pair, then partitions the results by image path. + The results are injected into the expand_globs cache. 
+ """ + image_paths = sorted(set(str(img.path) for img in self.images.values())) + specs = [f"{p}/**" for p in image_paths] + + root = self.rd.root + empty_tree = ( + "4b825dc642cb6eb9a060e54bf8d69288fbee4904" + ) + diff_files = spawn.capture( + ["git", "diff", "--name-only", "-z", "--relative", empty_tree, "--"] + + specs, + cwd=root, + ) + ls_files = spawn.capture( + ["git", "ls-files", "--others", "--exclude-standard", "-z", "--"] + specs, + cwd=root, + ) + all_files = set( + f for f in (diff_files + ls_files).split("\0") if f.strip() != "" + ) + + # Partition files by image path (longest match first for nested paths) + image_file_map: dict[str, set[str]] = {p: set() for p in image_paths} + sorted_paths = sorted(image_paths, key=len, reverse=True) + for f in all_files: + for ip in sorted_paths: + if f.startswith(ip + "/") or f.startswith(str(Path(ip)) + "/"): + image_file_map[ip].add(f) + break + + # Inject results into the expand_globs cache by calling it with + # the same arguments that ResolvedImage.inputs() will use. + # Since expand_globs is @functools.cache, we need to populate the + # cache with the right keys. We do this by replacing the function + # temporarily to return our pre-computed results. + for img in self.images.values(): + img._context_files_cache = image_file_map.get(str(img.path), set()) + def __iter__(self) -> Iterator[Image]: return iter(self.images.values()) From 3a2433a68c498c8e75ea91f0b3987caf1b96ca3a Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 13 Feb 2026 22:22:24 +0000 Subject: [PATCH 09/11] ci: eliminate remaining git subprocess calls from fingerprinting Two changes eliminate the remaining ~82 git subprocess calls from the fingerprinting path: 1. CargoPreImage.inputs() now resolves its hardcoded inputs eagerly: - 'ci/builder' directory is expanded to individual files via a single cached expand_globs call - '.cargo/config' is included only if it exists - Result is cached with @cache since it's the same for all images 2. 
ResolvedImage.fingerprint() skips the expand_globs verification pass when precomputed data is available, since all inputs are already individual file paths from git. This eliminates all per-crate and per-image git subprocess calls from resolve_dependencies, reducing the total from ~384 calls (baseline) to just 5 (2 for crate batch + 2 for image batch + 1 for ci/builder). Measured savings for resolve_dependencies(all 41 images): Before (no batching): 4.15s After (all batching): 0.40s Savings: 3.75s (90%) Measured savings for single composition (pg-cdc, munge_services=True): Before: 2.23s After: 0.26s Savings: 1.97s (88%) Co-Authored-By: Claude Opus 4.6 --- misc/python/materialize/mzbuild.py | 43 ++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py index f92e28f990a92..dd3e2d60e6d3f 100644 --- a/misc/python/materialize/mzbuild.py +++ b/misc/python/materialize/mzbuild.py @@ -468,18 +468,25 @@ def inputs(self) -> set[str]: class CargoPreImage(PreImage): """A `PreImage` action that uses Cargo.""" - def inputs(self) -> set[str]: - inputs = { - "ci/builder", - "Cargo.toml", - # TODO(benesch): we could in theory fingerprint only the subset of - # Cargo.lock that applies to the crates at hand, but that is a - # *lot* of work. - "Cargo.lock", - ".cargo/config", - } + @staticmethod + @cache + def _cargo_shared_inputs() -> frozenset[str]: + """Resolve shared Cargo inputs once and cache the result. - return inputs + This expands the 'ci/builder' directory glob and filters out + non-existent files like '.cargo/config', avoiding repeated + git subprocess calls in fingerprint(). 
+ """ + inputs: set[str] = set() + inputs |= git.expand_globs(Path("."), "ci/builder/**") + inputs.add("Cargo.toml") + inputs.add("Cargo.lock") + if Path(".cargo/config").exists(): + inputs.add(".cargo/config") + return frozenset(inputs) + + def inputs(self) -> set[str]: + return set(CargoPreImage._cargo_shared_inputs()) def extra(self) -> str: # Cargo images depend on the release mode and whether @@ -1098,9 +1105,17 @@ def fingerprint(self) -> Fingerprint: inputs via `PreImage.inputs`. """ self_hash = hashlib.sha1() - for rel_path in sorted( - set(git.expand_globs(self.image.rd.root, *self.inputs())) - ): + # When inputs come from precomputed sources (crate and image context + # batching + resolved CargoPreImage paths), they are already individual + # file paths from git. Skip the expensive expand_globs subprocess calls. + inputs = self.inputs() + if hasattr(self.image, "_context_files_cache"): + resolved_inputs = sorted(inputs) + else: + resolved_inputs = sorted( + set(git.expand_globs(self.image.rd.root, *inputs)) + ) + for rel_path in resolved_inputs: abs_path = self.image.rd.root / rel_path file_hash = hashlib.sha1() raw_file_mode = os.lstat(abs_path).st_mode From 8b621ccd1759724719e1e6b7a45ea67a87345c72 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 13 Feb 2026 22:36:16 +0000 Subject: [PATCH 10/11] ci: fix batched file discovery with absolute root paths The crate and image context precomputation used str(path) directly for both git pathspecs and the file partitioning step. This works when the repository root is a relative path (Path(".")), but fails when it's an absolute path (as it is when MZ_ROOT is set in CI via mzcompose). The issue: git --relative outputs paths relative to cwd, but the partition logic compared these relative paths against potentially absolute image/crate paths, causing the startswith() check to fail. This left _context_files_cache empty, triggering the "files are unknown to git" assertion. 
Fix: use path.relative_to(root) to normalize all paths before constructing git specs and partitioning results. Co-Authored-By: Claude Opus 4.6 --- misc/python/materialize/cargo.py | 18 +++++++++++------ misc/python/materialize/mzbuild.py | 32 +++++++++++++----------------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/misc/python/materialize/cargo.py b/misc/python/materialize/cargo.py index 0e5d7e378f35d..3dde480ba0869 100644 --- a/misc/python/materialize/cargo.py +++ b/misc/python/materialize/cargo.py @@ -256,15 +256,20 @@ def precompute_crate_inputs(self) -> None: """ from materialize import spawn - crate_paths = sorted(set(str(c.path) for c in self.all_crates.values())) + root = next(iter(self.all_crates.values())).root + # Use paths relative to root for git specs and partitioning, since + # git --relative outputs paths relative to cwd (root). Crate paths + # may be absolute when MZ_ROOT is an absolute path. + crate_rel_paths = sorted( + set(str(c.path.relative_to(root)) for c in self.all_crates.values()) + ) specs = [] - for p in crate_paths: + for p in crate_rel_paths: specs.append(f"{p}/**") specs.append(f":(exclude){p}/mzcompose") specs.append(f":(exclude){p}/mzcompose.py") - root = next(iter(self.all_crates.values())).root empty_tree = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" diff_files = spawn.capture( ["git", "diff", "--name-only", "-z", "--relative", empty_tree, "--"] @@ -280,8 +285,8 @@ def precompute_crate_inputs(self) -> None: ) # Partition files by crate path (longest match first for nested crates) - crate_file_map: dict[str, set[str]] = {p: set() for p in crate_paths} - sorted_paths = sorted(crate_paths, key=len, reverse=True) + crate_file_map: dict[str, set[str]] = {p: set() for p in crate_rel_paths} + sorted_paths = sorted(crate_rel_paths, key=len, reverse=True) for f in all_files: for cp in sorted_paths: if f.startswith(cp + "/"): @@ -290,4 +295,5 @@ def precompute_crate_inputs(self) -> None: # Inject cached results into each 
Crate object for crate in self.all_crates.values(): - crate._inputs_cache = crate_file_map.get(str(crate.path), set()) + rel = str(crate.path.relative_to(root)) + crate._inputs_cache = crate_file_map.get(rel, set()) diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py index dd3e2d60e6d3f..401fa11983394 100644 --- a/misc/python/materialize/mzbuild.py +++ b/misc/python/materialize/mzbuild.py @@ -1112,9 +1112,7 @@ def fingerprint(self) -> Fingerprint: if hasattr(self.image, "_context_files_cache"): resolved_inputs = sorted(inputs) else: - resolved_inputs = sorted( - set(git.expand_globs(self.image.rd.root, *inputs)) - ) + resolved_inputs = sorted(set(git.expand_globs(self.image.rd.root, *inputs))) for rel_path in resolved_inputs: abs_path = self.image.rd.root / rel_path file_hash = hashlib.sha1() @@ -1537,15 +1535,17 @@ def _precompute_image_context_files(self) -> None: This replaces ~41 individual pairs of git subprocess calls (one per image) with a single pair, then partitions the results by image path. - The results are injected into the expand_globs cache. """ - image_paths = sorted(set(str(img.path) for img in self.images.values())) - specs = [f"{p}/**" for p in image_paths] - root = self.rd.root - empty_tree = ( - "4b825dc642cb6eb9a060e54bf8d69288fbee4904" + # Use paths relative to root for git specs and partitioning, since + # git --relative outputs paths relative to cwd (root). Image paths + # may be absolute when MZ_ROOT is an absolute path. 
+ image_rel_paths = sorted( + set(str(img.path.relative_to(root)) for img in self.images.values()) ) + specs = [f"{p}/**" for p in image_rel_paths] + + empty_tree = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" diff_files = spawn.capture( ["git", "diff", "--name-only", "-z", "--relative", empty_tree, "--"] + specs, @@ -1560,21 +1560,17 @@ def _precompute_image_context_files(self) -> None: ) # Partition files by image path (longest match first for nested paths) - image_file_map: dict[str, set[str]] = {p: set() for p in image_paths} - sorted_paths = sorted(image_paths, key=len, reverse=True) + image_file_map: dict[str, set[str]] = {p: set() for p in image_rel_paths} + sorted_paths = sorted(image_rel_paths, key=len, reverse=True) for f in all_files: for ip in sorted_paths: - if f.startswith(ip + "/") or f.startswith(str(Path(ip)) + "/"): + if f.startswith(ip + "/"): image_file_map[ip].add(f) break - # Inject results into the expand_globs cache by calling it with - # the same arguments that ResolvedImage.inputs() will use. - # Since expand_globs is @functools.cache, we need to populate the - # cache with the right keys. We do this by replacing the function - # temporarily to return our pre-computed results. for img in self.images.values(): - img._context_files_cache = image_file_map.get(str(img.path), set()) + rel = str(img.path.relative_to(root)) + img._context_files_cache = image_file_map.get(rel, set()) def __iter__(self) -> Iterator[Image]: return iter(self.images.values()) From 710a3c958d582ce3b181f985d805b1c68eddf219 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 13 Feb 2026 23:03:55 +0000 Subject: [PATCH 11/11] ci: declare cache attributes as class fields for pyright Pyright's type checker requires attributes to be declared on the class. Declare _inputs_cache on Crate and _context_files_cache on Image as Optional[set[str]] fields, initialized to None in __init__, and replace hasattr() checks with `is not None` comparisons. 
Co-Authored-By: Claude Opus 4.6 --- misc/python/materialize/cargo.py | 5 ++++- misc/python/materialize/mzbuild.py | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/misc/python/materialize/cargo.py b/misc/python/materialize/cargo.py index 3dde480ba0869..e6bb51b307641 100644 --- a/misc/python/materialize/cargo.py +++ b/misc/python/materialize/cargo.py @@ -48,8 +48,11 @@ class Crate: examples: The names of all examples in the crate. """ + _inputs_cache: set[str] | None + def __init__(self, root: Path, path: Path): self.root = root + self._inputs_cache = None with open(path / "Cargo.toml") as f: config = toml.load(f) self.name = config["package"]["name"] @@ -116,7 +119,7 @@ def inputs(self) -> set[str]: # † As a development convenience, we omit mzcompose configuration files # within a crate. This is technically incorrect if someone writes # `include!("mzcompose.py")`, but that seems like a crazy thing to do. - if hasattr(self, "_inputs_cache"): + if self._inputs_cache is not None: return self._inputs_cache return git.expand_globs( self.root, diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py index 401fa11983394..c7e2d3daba387 100644 --- a/misc/python/materialize/mzbuild.py +++ b/misc/python/materialize/mzbuild.py @@ -758,9 +758,12 @@ class Image: _DOCKERFILE_MZFROM_RE = re.compile(rb"^MZFROM\s*(\S+)") + _context_files_cache: set[str] | None + def __init__(self, rd: RepositoryDetails, path: Path): self.rd = rd self.path = path + self._context_files_cache = None self.pre_images: list[PreImage] = [] with open(self.path / "mzbuild.yml") as f: data = yaml.safe_load(f) @@ -1074,7 +1077,7 @@ def inputs(self, transitive: bool = False) -> set[str]: inputs: A list of input files, relative to the root of the repository. 
""" - if hasattr(self.image, "_context_files_cache"): + if self.image._context_files_cache is not None: paths = set(self.image._context_files_cache) else: paths = set(git.expand_globs(self.image.rd.root, f"{self.image.path}/**")) @@ -1109,7 +1112,7 @@ def fingerprint(self) -> Fingerprint: # batching + resolved CargoPreImage paths), they are already individual # file paths from git. Skip the expensive expand_globs subprocess calls. inputs = self.inputs() - if hasattr(self.image, "_context_files_cache"): + if self.image._context_files_cache is not None: resolved_inputs = sorted(inputs) else: resolved_inputs = sorted(set(git.expand_globs(self.image.rd.root, *inputs)))