From a88987e089816e8cea4165e4ad4d6a2b84352218 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 25 Mar 2026 18:35:41 +0000 Subject: [PATCH 1/9] refactor(git): improve code health in git.py (CodeScene issues) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address four CodeScene-flagged issues in dfetch/vcs/git.py: 1. Bumpy Road – _move_src_folder_up (complexity 10 → 4) Extract _collect_source_dirs() and _move_directory_contents() to eliminate three nested-conditional bumps. 2. Bumpy Road – create_diff (2 bumps) Extract _build_hash_args() and _build_ignore_args() so each conditional block has a single, named responsibility. 3. Code Duplication – get_username / get_useremail Introduce _get_git_config_value(key) helper; both methods now delegate to it, removing structural copy-paste. 4. Too Many Arguments – checkout_version (5 args → 1) Introduce CheckoutOptions dataclass; update the single call site in gitsubproject.py accordingly. All 50 existing tests continue to pass. https://claude.ai/code/session_014ZRQGvyTdWBqoRjtu7gsSC --- dfetch/project/gitsubproject.py | 14 +-- dfetch/vcs/git.py | 178 +++++++++++++++++++++----------- 2 files changed, 128 insertions(+), 64 deletions(-) diff --git a/dfetch/project/gitsubproject.py b/dfetch/project/gitsubproject.py index 757027b4..21ee85d3 100644 --- a/dfetch/project/gitsubproject.py +++ b/dfetch/project/gitsubproject.py @@ -9,7 +9,7 @@ from dfetch.project.metadata import Dependency from dfetch.project.subproject import SubProject from dfetch.util.util import LICENSE_GLOBS, safe_rm -from dfetch.vcs.git import GitLocalRepo, GitRemote, get_git_version +from dfetch.vcs.git import CheckoutOptions, GitLocalRepo, GitRemote, get_git_version logger = get_logger(__name__) @@ -70,11 +70,13 @@ def _fetch_impl(self, version: Version) -> tuple[Version, list[Dependency]]: local_repo = GitLocalRepo(self.local_path) fetched_sha, submodules = local_repo.checkout_version( - remote=self.remote, - version=rev_or_branch_or_tag, - src=self.source, - must_keeps=license_globs + [".gitmodules"], - ignore=self.ignore, + CheckoutOptions( + remote=self.remote, + version=rev_or_branch_or_tag, + src=self.source, + must_keeps=license_globs + [".gitmodules"], + ignore=self.ignore, + ) ) vcs_deps = [] diff --git a/dfetch/vcs/git.py b/dfetch/vcs/git.py index d803cc62..bde6574b 100644 --- a/dfetch/vcs/git.py +++ b/dfetch/vcs/git.py @@ -31,6 +31,17 @@ class Submodule: tag: str +@dataclass +class CheckoutOptions: + """Options for checking out a specific version from a remote git repository.""" + + remote: str + version: str + src: str | None = None + must_keeps: list[str] | None = None + ignore: Sequence[str] | None = None + + def get_git_version() -> tuple[str, str]: """Get the name and version of git.""" result = run_on_cmdline(logger, ["git", "--version"]) @@ -270,35 +281,28 @@ def _configure_sparse_checkout( f.write("\n".join(map(str, patterns)) + "\n") - def checkout_version( # pylint: disable=too-many-arguments + def checkout_version( self, - *, - remote: str, - version: str, - src: str | None = None, - must_keeps: list[str] | None = None, - ignore: Sequence[str] | None = None, + options: CheckoutOptions, ) -> tuple[str, list[Submodule]]: """Checkout a specific version from a given remote. Args: - remote (str): Url or path to a remote git repository - version (str): A target to checkout, can be branch, tag or sha - src (Optional[str]): Optional path to subdirectory or file in repo - must_keeps (Optional[List[str]]): Optional list of glob patterns to keep - ignore (Optional[Sequence[str]]): Optional sequence of glob patterns to ignore (relative to src) + options: A :class:`CheckoutOptions` instance describing what to fetch. """ with in_directory(self._path): run_on_cmdline(logger, ["git", "init"]) - run_on_cmdline(logger, ["git", "remote", "add", "origin", remote]) + run_on_cmdline(logger, ["git", "remote", "add", "origin", options.remote]) run_on_cmdline(logger, ["git", "checkout", "-b", "dfetch-local-branch"]) - if src or ignore: - self._configure_sparse_checkout(src, must_keeps or [], ignore) + if options.src or options.ignore: + self._configure_sparse_checkout( + options.src, options.must_keeps or [], options.ignore + ) run_on_cmdline( logger, - ["git", "fetch", "--depth", "1", "origin", version], + ["git", "fetch", "--depth", "1", "origin", options.version], env=_extend_env_for_non_interactive_mode(), ) run_on_cmdline(logger, ["git", "reset", "--hard", "FETCH_HEAD"]) @@ -317,7 +321,9 @@ def checkout_version( # pylint: disable=too-many-arguments .strip() ) - submodules = self._apply_src_and_ignore(remote, src, ignore, submodules) + submodules = self._apply_src_and_ignore( + options.remote, options.src, options.ignore, submodules + ) return str(current_sha), submodules @@ -344,6 +350,51 @@ def _apply_src_and_ignore( return [s for s in submodules if os.path.exists(s.path)] + @staticmethod + def _collect_source_dirs(matched_paths: list[str]) -> list[str]: + """Collect unique parent directories from a list of matched paths. + + For each path, if it is a directory it is used directly; if it is a file, + its parent directory is used instead. Ordering is preserved and duplicates + are removed. + + Args: + matched_paths: Sorted list of glob-matched filesystem paths. + + Returns: + Ordered list of unique directory paths. + """ + dirs: list[str] = [] + for src_dir_path in matched_paths: + if os.path.isdir(src_dir_path): + dirs.append(src_dir_path) + else: + dir_path = os.path.dirname(src_dir_path) + if dir_path: + dirs.append(dir_path) + return list(dict.fromkeys(dirs)) + + @staticmethod + def _move_directory_contents(src_dir_path: str, remote: str, src: str) -> None: + """Move every file inside *src_dir_path* into the current working directory. + + After all files have been moved the top-level ancestor of *src_dir_path* + is removed. + + Args: + src_dir_path: Path to the directory whose contents should be moved. + remote: Remote name used only for warning messages. + src: Original src filter used only for warning messages. + """ + try: + for file_to_copy in os.listdir(src_dir_path): + shutil.move(src_dir_path + "/" + file_to_copy, ".") + safe_rm(PurePath(src_dir_path).parts[0]) + except FileNotFoundError: + logger.warning( + f"The 'src:' filter '{src_dir_path}' didn't match any files from '{remote}'" + ) + @staticmethod def _move_src_folder_up(remote: str, src: str) -> None: """Move the files from the src folder into the root of the project. @@ -360,15 +411,7 @@ def _move_src_folder_up(remote: str, src: str) -> None: ) return - dirs = [] - for src_dir_path in matched_paths: - if os.path.isdir(src_dir_path): - dirs.append(src_dir_path) - else: - if dir_path := os.path.dirname(src_dir_path): - dirs.append(dir_path) - - unique_dirs = list(dict.fromkeys(dirs)) + unique_dirs = GitLocalRepo._collect_source_dirs(matched_paths) if len(unique_dirs) > 1: logger.warning( @@ -377,15 +420,7 @@ def _move_src_folder_up(remote: str, src: str) -> None: ) for src_dir_path in unique_dirs[:1]: - try: - for file_to_copy in os.listdir(src_dir_path): - shutil.move(src_dir_path + "/" + file_to_copy, ".") - safe_rm(PurePath(src_dir_path).parts[0]) - except FileNotFoundError: - logger.warning( - f"The 'src:' filter '{src_dir_path}' didn't match any files from '{remote}'" - ) - continue + GitLocalRepo._move_directory_contents(src_dir_path, remote, src) @staticmethod def _determine_ignore_paths( @@ -435,6 +470,39 @@ def get_remote_url() -> str: return decoded_result + @staticmethod + def _build_hash_args(old_hash: str | None, new_hash: str | None) -> list[str]: + """Build the git-diff positional hash arguments. + + Returns a list containing the old hash (and optionally the new hash) to + be appended to a ``git diff`` command. + + Args: + old_hash: The base commit SHA, or ``None`` to diff against the working tree. + new_hash: The target commit SHA, or ``None`` to omit it. + + Returns: + A list of zero, one, or two SHA strings. + """ + if not old_hash: + return [] + return [old_hash, new_hash] if new_hash else [old_hash] + + @staticmethod + def _build_ignore_args(ignore: Sequence[str] | None) -> list[str]: + """Build the git-diff pathspec arguments that exclude ignored paths. + + Args: + ignore: Sequence of glob patterns to exclude, or ``None`` for no exclusions. + + Returns: + A list of pathspec arguments starting with ``-- .`` followed by + ``:(exclude)`` entries, or an empty list when *ignore* is falsy. + """ + if not ignore: + return [] + return ["--", "."] + [f":(exclude){p}" for p in ignore] + def create_diff( self, old_hash: str | None, @@ -456,15 +524,9 @@ def create_diff( if reverse: cmd.extend(["-R", "--src-prefix=b/", "--dst-prefix=a/"]) - if old_hash: - cmd.append(old_hash) - if new_hash: - cmd.append(new_hash) + cmd.extend(GitLocalRepo._build_hash_args(old_hash, new_hash)) + cmd.extend(GitLocalRepo._build_ignore_args(ignore)) - if ignore: - cmd.extend(["--", "."]) - for ignore_path in ignore: - cmd.append(f":(exclude){ignore_path}") result = run_on_cmdline(logger, cmd) return str(result.stdout.decode()) @@ -646,26 +708,26 @@ def find_branch_containing_sha(self, sha: str) -> str: return "" if not branches else branches[0] - def get_username(self) -> str: - """Get the username of the local git repo.""" + def _get_git_config_value(self, key: str) -> str: + """Read a single git config value from the local repo. + + Args: + key: The git config key to query (e.g. ``user.name``). + + Returns: + The stripped config value, or an empty string if the key is absent. + """ try: with in_directory(self._path): - result = run_on_cmdline( - logger, - ["git", "config", "user.name"], - ) + result = run_on_cmdline(logger, ["git", "config", key]) return str(result.stdout.decode().strip()) except SubprocessCommandError: return "" + def get_username(self) -> str: + """Get the username of the local git repo.""" + return self._get_git_config_value("user.name") + def get_useremail(self) -> str: """Get the user email of the local git repo.""" - try: - with in_directory(self._path): - result = run_on_cmdline( - logger, - ["git", "config", "user.email"], - ) - return str(result.stdout.decode().strip()) - except SubprocessCommandError: - return "" + return self._get_git_config_value("user.email") From 7a3c183c741d5ccb92b568f350228fba2a25489a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 25 Mar 2026 18:40:21 +0000 Subject: [PATCH 2/9] refactor(git): clean up idioms, fix docstrings, add unit tests for helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace `for x in list[:1]:` anti-pattern with `if list: x = list[0]` - Fix misleading any_changes_or_untracked docstring (returns bool, not a list) - Trim Args/Returns blocks from 2-line private helpers (_build_hash_args, _build_ignore_args, _collect_source_dirs) — code already says it - Add 14 parametrized unit tests covering all edge cases of the three newly extracted pure helpers (_collect_source_dirs, _build_hash_args, _build_ignore_args) https://claude.ai/code/session_014ZRQGvyTdWBqoRjtu7gsSC --- dfetch/vcs/git.py | 42 +++++------------------------ tests/test_git_vcs.py | 61 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 36 deletions(-) diff --git a/dfetch/vcs/git.py b/dfetch/vcs/git.py index bde6574b..9c40eead 100644 --- a/dfetch/vcs/git.py +++ b/dfetch/vcs/git.py @@ -352,18 +352,7 @@ def _apply_src_and_ignore( @staticmethod def _collect_source_dirs(matched_paths: list[str]) -> list[str]: - """Collect unique parent directories from a list of matched paths. - - For each path, if it is a directory it is used directly; if it is a file, - its parent directory is used instead. Ordering is preserved and duplicates - are removed. - - Args: - matched_paths: Sorted list of glob-matched filesystem paths. - - Returns: - Ordered list of unique directory paths. - """ + """Return unique parent directories for each matched path, preserving order.""" dirs: list[str] = [] for src_dir_path in matched_paths: if os.path.isdir(src_dir_path): @@ -419,8 +408,8 @@ def _move_src_folder_up(remote: str, src: str) -> None: f"Only considering files in '{unique_dirs[0]}'." ) - for src_dir_path in unique_dirs[:1]: - GitLocalRepo._move_directory_contents(src_dir_path, remote, src) + if unique_dirs: + GitLocalRepo._move_directory_contents(unique_dirs[0], remote, src) @staticmethod def _determine_ignore_paths( @@ -472,33 +461,14 @@ def get_remote_url() -> str: @staticmethod def _build_hash_args(old_hash: str | None, new_hash: str | None) -> list[str]: - """Build the git-diff positional hash arguments. - - Returns a list containing the old hash (and optionally the new hash) to - be appended to a ``git diff`` command. - - Args: - old_hash: The base commit SHA, or ``None`` to diff against the working tree. - new_hash: The target commit SHA, or ``None`` to omit it. - - Returns: - A list of zero, one, or two SHA strings. - """ + """Return the SHA positional arguments for git diff (zero, one, or two hashes).""" if not old_hash: return [] return [old_hash, new_hash] if new_hash else [old_hash] @staticmethod def _build_ignore_args(ignore: Sequence[str] | None) -> list[str]: - """Build the git-diff pathspec arguments that exclude ignored paths. - - Args: - ignore: Sequence of glob patterns to exclude, or ``None`` for no exclusions. - - Returns: - A list of pathspec arguments starting with ``-- .`` followed by - ``:(exclude)`` entries, or an empty list when *ignore* is falsy. - """ + """Return git-diff pathspec arguments that exclude each pattern in *ignore*.""" if not ignore: return [] return ["--", "."] + [f":(exclude){p}" for p in ignore] @@ -556,7 +526,7 @@ def ignored_files(path: str) -> Sequence[str]: @staticmethod def any_changes_or_untracked(path: str) -> bool: - """List of any changed files.""" + """Return True if the repo at *path* has any changed or untracked files.""" if not Path(path).exists(): raise RuntimeError("Path does not exist.") diff --git a/tests/test_git_vcs.py b/tests/test_git_vcs.py index 0089dcd4..50cbeaef 100644 --- a/tests/test_git_vcs.py +++ b/tests/test_git_vcs.py @@ -17,6 +17,67 @@ ) +# --------------------------------------------------------------------------- +# GitLocalRepo._collect_source_dirs +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "name, paths, isdir_results, expected", + [ + ("empty input", [], [], []), + ("all directories", ["a", "b"], [True, True], ["a", "b"]), + ("all files with parent", ["a/x.c", "b/y.c"], [False, False], ["a", "b"]), + ("file at root — no parent dir", ["x.c"], [False], []), + ("mixed dir and file in same parent", ["a", "a/y.c"], [True, False], ["a"]), + ("deduplication preserves order", ["a/x.c", "a/y.c"], [False, False], ["a"]), + ], +) +def test_collect_source_dirs(name, paths, isdir_results, expected): + with patch("dfetch.vcs.git.os.path.isdir", side_effect=isdir_results): + assert GitLocalRepo._collect_source_dirs(paths) == expected + + +# --------------------------------------------------------------------------- +# GitLocalRepo._build_hash_args +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "name, old_hash, new_hash, expected", + [ + ("no hashes", None, None, []), + ("old hash only", "abc123", None, ["abc123"]), + ("both hashes", "abc123", "def456", ["abc123", "def456"]), + ("new hash without old hash is ignored", None, "def456", []), + ], +) +def test_build_hash_args(name, old_hash, new_hash, expected): + assert GitLocalRepo._build_hash_args(old_hash, new_hash) == expected + + +# --------------------------------------------------------------------------- +# GitLocalRepo._build_ignore_args +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "name, ignore, expected", + [ + ("None", None, []), + ("empty sequence", [], []), + ("single pattern", ["*.txt"], ["--", ".", ":(exclude)*.txt"]), + ( + "multiple patterns", + ["*.txt", "build/"], + ["--", ".", ":(exclude)*.txt", ":(exclude)build/"], + ), + ], +) +def test_build_ignore_args(name, ignore, expected): + assert GitLocalRepo._build_ignore_args(ignore) == expected + + @pytest.mark.parametrize( "name, cmd_result, expectation", [ From 9c19cf2aa4827c84a086e7780cef8d61bf5ccfc8 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 25 Mar 2026 18:46:55 +0000 Subject: [PATCH 3/9] fix(git): remove unused `src` arg from _move_directory_contents; fix isort _move_directory_contents accepted `src` for warning messages but the warning already used `src_dir_path`. Dropped the dead parameter to satisfy pylint (10.00/10). Also applied isort to test file. https://claude.ai/code/session_014ZRQGvyTdWBqoRjtu7gsSC --- dfetch/vcs/git.py | 9 ++------- tests/test_git_vcs.py | 1 - 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/dfetch/vcs/git.py b/dfetch/vcs/git.py index 9c40eead..de68d062 100644 --- a/dfetch/vcs/git.py +++ b/dfetch/vcs/git.py @@ -364,16 +364,11 @@ def _collect_source_dirs(matched_paths: list[str]) -> list[str]: return list(dict.fromkeys(dirs)) @staticmethod - def _move_directory_contents(src_dir_path: str, remote: str, src: str) -> None: + def _move_directory_contents(src_dir_path: str, remote: str) -> None: """Move every file inside *src_dir_path* into the current working directory. After all files have been moved the top-level ancestor of *src_dir_path* is removed. - - Args: - src_dir_path: Path to the directory whose contents should be moved. - remote: Remote name used only for warning messages. - src: Original src filter used only for warning messages. """ try: for file_to_copy in os.listdir(src_dir_path): @@ -409,7 +404,7 @@ def _move_src_folder_up(remote: str, src: str) -> None: ) if unique_dirs: - GitLocalRepo._move_directory_contents(unique_dirs[0], remote, src) + GitLocalRepo._move_directory_contents(unique_dirs[0], remote) @staticmethod def _determine_ignore_paths( diff --git a/tests/test_git_vcs.py b/tests/test_git_vcs.py index 50cbeaef..c76d63f8 100644 --- a/tests/test_git_vcs.py +++ b/tests/test_git_vcs.py @@ -16,7 +16,6 @@ _build_git_ssh_command, ) - # --------------------------------------------------------------------------- # GitLocalRepo._collect_source_dirs # --------------------------------------------------------------------------- From f07eec234071317333b380e293f5a31a5d0eb733 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 25 Mar 2026 19:09:37 +0000 Subject: [PATCH 4/9] refactor: apply same code-health fixes to archive and svn modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit archive.py – _http_download (5 → 3 levels deep) Extract _stream_response_to_file() to handle the write loop, removing the with-open / while-chunk / if-hasher nesting from the redirect loop. svnsubproject.py – _fetch_impl for/break anti-pattern Replace `for file in ...: ...; break` (only ever processes the first element) with `license_files = ...; if license_files:` — same fix applied to _move_src_folder_up in the previous git.py refactor. archivesubproject.py – duplicated try-finally temp-file cleanup Both _download_and_compute_hash() and _fetch_impl() contained identical fd/mkstemp/try/finally/os.remove blocks. Extract a _temp_file() context manager so each caller becomes a clean `with _temp_file(...) as tmp_path:` block. Adds contextlib + Generator imports. https://claude.ai/code/session_014ZRQGvyTdWBqoRjtu7gsSC --- dfetch/project/archivesubproject.py | 34 +++++++++++++++-------------- dfetch/project/svnsubproject.py | 7 +++--- dfetch/vcs/archive.py | 19 +++++++++++----- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index 91a684b9..1c8f1f67 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -41,9 +41,11 @@ from __future__ import annotations +import contextlib import os import pathlib import tempfile +from collections.abc import Generator from dfetch.log import get_logger from dfetch.manifest.project import ProjectEntry @@ -70,6 +72,20 @@ def _suffix_for_url(url: str) -> str: return ".archive" +@contextlib.contextmanager +def _temp_file(suffix: str) -> Generator[str, None, None]: + """Create a temporary file, yield its path, and always delete it on exit.""" + fd, tmp_path = tempfile.mkstemp(suffix=suffix) + os.close(fd) + try: + yield tmp_path + finally: + try: + os.remove(tmp_path) + except OSError: + pass + + class ArchiveSubProject(SubProject): """A project fetched from a tar/zip archive URL. @@ -126,16 +142,9 @@ def _download_and_compute_hash( """ effective_url = url if url is not None else self.remote remote = ArchiveRemote(effective_url) if url is not None else self._remote_repo - fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(effective_url)) - os.close(fd) - try: + with _temp_file(_suffix_for_url(effective_url)) as tmp_path: hex_digest = remote.download(tmp_path, algorithm=algorithm) return IntegrityHash(algorithm, hex_digest) - finally: - try: - os.remove(tmp_path) - except OSError: - pass def _does_revision_exist(self, revision: str) -> bool: # noqa: ARG002 """Check whether the archive URL is still reachable. @@ -182,9 +191,7 @@ def _fetch_impl(self, version: Version) -> tuple[Version, list[Dependency]]: pathlib.Path(self.local_path).mkdir(parents=True, exist_ok=True) - fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(self.remote)) - os.close(fd) - try: + with _temp_file(_suffix_for_url(self.remote)) as tmp_path: expected = IntegrityHash.parse(revision) if expected: actual_hex = self._remote_repo.download( @@ -204,11 +211,6 @@ def _fetch_impl(self, version: Version) -> tuple[Version, list[Dependency]]: src=self.source, ignore=self.ignore, ) - finally: - try: - os.remove(tmp_path) - except OSError: - pass return version, [] diff --git a/dfetch/project/svnsubproject.py b/dfetch/project/svnsubproject.py index 6333856e..1ec167f8 100644 --- a/dfetch/project/svnsubproject.py +++ b/dfetch/project/svnsubproject.py @@ -136,15 +136,14 @@ def _fetch_impl(self, version: Version) -> tuple[Version, list[Dependency]]: if self.source: root_branch_path = "/".join([self.remote, branch_path]).strip("/") - - for file in SvnSubProject._license_files(root_branch_path): + license_files = SvnSubProject._license_files(root_branch_path) + if license_files: dest = ( self.local_path if os.path.isdir(self.local_path) else os.path.dirname(self.local_path) ) - SvnRepo.export(f"{root_branch_path}/{file}", rev_arg, dest) - break + SvnRepo.export(f"{root_branch_path}/{license_files[0]}", rev_arg, dest) if self.ignore: self._remove_ignored_files() diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index a8caed9a..9cbe513b 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -200,6 +200,19 @@ def download(self, dest_path: str, algorithm: str | None = None) -> str | None: _MAX_REDIRECTS = 10 + @staticmethod + def _stream_response_to_file( + resp: http.client.HTTPResponse, + dest_path: str, + hasher: hashlib._Hash | None, + ) -> None: + """Write the response body to *dest_path*, updating *hasher* for each chunk.""" + with open(dest_path, "wb") as fh: + while chunk := resp.read(65536): + fh.write(chunk) + if hasher: + hasher.update(chunk) + def _http_download( self, parsed: urllib.parse.ParseResult, @@ -232,11 +245,7 @@ def _http_download( raise RuntimeError( f"HTTP {resp.status} when downloading '{self.url}'" ) - with open(dest_path, "wb") as fh: - while chunk := resp.read(65536): - fh.write(chunk) - if hasher: - hasher.update(chunk) + self._stream_response_to_file(resp, dest_path, hasher) return except (OSError, http.client.HTTPException) as exc: raise RuntimeError( From 3a91f966e181d6308d9243ac466ad4f13146097c Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 25 Mar 2026 21:58:36 +0000 Subject: [PATCH 5/9] refactor: move file helpers to util.py with intent-revealing names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three private helpers extracted in earlier commits belong in util.py — they are general filesystem utilities with no VCS-specific logic. util.py — three additions: temp_file(suffix) Context manager that creates a temp file, yields its path, and guarantees deletion on exit. Replaces the local _temp_file in archivesubproject.py. unique_parent_dirs(paths) Returns the unique parent directories for a list of paths (dirs are kept as-is; files resolve to their parent). Name describes the result, not the mechanism. Replaces GitLocalRepo._collect_source_dirs whose name described the git-specific context it was extracted from. Parallels existing copy/move directory helpers. move_directory_contents(src_dir, dest_dir) Moves every entry from src_dir into dest_dir. Complements the existing copy_directory_contents. The git-specific domain logic (safe_rm of the top ancestor + FileNotFoundError warning) stays in git.py's _move_src_folder_up where it belongs. Call-site changes: git.py — imports unique_parent_dirs + move_directory_contents; removes two static methods; drops unused shutil import. archivesubproject.py — imports temp_file; removes contextlib, Generator, os, tempfile imports that were only needed locally. tests/test_git_vcs.py — test_collect_source_dirs → test_unique_parent_dirs; patch target updated to dfetch.util.util.os.path.isdir. https://claude.ai/code/session_014ZRQGvyTdWBqoRjtu7gsSC --- dfetch/project/archivesubproject.py | 23 ++---------- dfetch/util/util.py | 42 ++++++++++++++++++++++ dfetch/vcs/git.py | 55 ++++++++++------------------- tests/test_git_vcs.py | 9 ++--- 4 files changed, 69 insertions(+), 60 deletions(-) diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index 1c8f1f67..86336214 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -41,17 +41,14 @@ from __future__ import annotations -import contextlib -import os import pathlib -import tempfile -from collections.abc import Generator from dfetch.log import get_logger from dfetch.manifest.project import ProjectEntry from dfetch.manifest.version import Version from dfetch.project.metadata import Dependency from dfetch.project.subproject import SubProject +from dfetch.util.util import temp_file from dfetch.vcs.archive import ( ARCHIVE_EXTENSIONS, ArchiveLocalRepo, @@ -72,20 +69,6 @@ def _suffix_for_url(url: str) -> str: return ".archive" -@contextlib.contextmanager -def _temp_file(suffix: str) -> Generator[str, None, None]: - """Create a temporary file, yield its path, and always delete it on exit.""" - fd, tmp_path = tempfile.mkstemp(suffix=suffix) - os.close(fd) - try: - yield tmp_path - finally: - try: - os.remove(tmp_path) - except OSError: - pass - - class ArchiveSubProject(SubProject): """A project fetched from a tar/zip archive URL. @@ -142,7 +125,7 @@ def _download_and_compute_hash( """ effective_url = url if url is not None else self.remote remote = ArchiveRemote(effective_url) if url is not None else self._remote_repo - with _temp_file(_suffix_for_url(effective_url)) as tmp_path: + with temp_file(_suffix_for_url(effective_url)) as tmp_path: hex_digest = remote.download(tmp_path, algorithm=algorithm) return IntegrityHash(algorithm, hex_digest) @@ -191,7 +174,7 @@ def _fetch_impl(self, version: Version) -> tuple[Version, list[Dependency]]: pathlib.Path(self.local_path).mkdir(parents=True, exist_ok=True) - with _temp_file(_suffix_for_url(self.remote)) as tmp_path: + with temp_file(_suffix_for_url(self.remote)) as tmp_path: expected = IntegrityHash.parse(revision) if expected: actual_hex = self._remote_repo.download( diff --git a/dfetch/util/util.py b/dfetch/util/util.py index 01452655..b3a6dfdd 100644 --- a/dfetch/util/util.py +++ b/dfetch/util/util.py @@ -5,6 +5,7 @@ import os import shutil import stat +import tempfile from collections.abc import Generator, Iterator, Sequence from contextlib import contextmanager from pathlib import Path, PurePath @@ -346,3 +347,44 @@ def resolve_absolute_path(path: str | Path) -> Path: - Handles Windows drive-relative paths and expands '~'. """ return Path(os.path.realpath(Path(path).expanduser())) + + +@contextmanager +def temp_file(suffix: str = "") -> Generator[str, None, None]: + """Create a temporary file, yield its path, and always delete it on exit.""" + fd, tmp_path = tempfile.mkstemp(suffix=suffix) + os.close(fd) + try: + yield tmp_path + finally: + try: + os.remove(tmp_path) + except OSError: + pass + + +def unique_parent_dirs(paths: list[str]) -> list[str]: + """Return the unique parent directories for a list of paths, preserving order. + + For each path that is a directory, the path itself is used. For a file, + its parent directory is used. Root-level files (no parent) are skipped. + Duplicates are removed while preserving the first-seen order. + """ + dirs: list[str] = [] + for path in paths: + if os.path.isdir(path): + dirs.append(path) + else: + parent = os.path.dirname(path) + if parent: + dirs.append(parent) + return list(dict.fromkeys(dirs)) + + +def move_directory_contents(src_dir: str, dest_dir: str) -> None: + """Move every entry in *src_dir* directly into *dest_dir*. + + Complements :func:`copy_directory_contents`. + """ + for entry in os.listdir(src_dir): + shutil.move(os.path.join(src_dir, entry), dest_dir) diff --git a/dfetch/vcs/git.py b/dfetch/vcs/git.py index de68d062..16cada0c 100644 --- a/dfetch/vcs/git.py +++ b/dfetch/vcs/git.py @@ -4,7 +4,6 @@ import glob import os import re -import shutil import tempfile from collections.abc import Generator, Sequence from dataclasses import dataclass @@ -12,7 +11,14 @@ from dfetch.log import get_logger from dfetch.util.cmdline import SubprocessCommandError, run_on_cmdline -from dfetch.util.util import in_directory, is_license_file, safe_rm, strip_glob_prefix +from dfetch.util.util import ( + in_directory, + is_license_file, + move_directory_contents, + safe_rm, + strip_glob_prefix, + unique_parent_dirs, +) from dfetch.vcs.patch import Patch, PatchType logger = get_logger(__name__) @@ -350,35 +356,6 @@ def _apply_src_and_ignore( return [s for s in submodules if os.path.exists(s.path)] - @staticmethod - def _collect_source_dirs(matched_paths: list[str]) -> list[str]: - """Return unique parent directories for each matched path, preserving order.""" - dirs: list[str] = [] - for src_dir_path in matched_paths: - if os.path.isdir(src_dir_path): - dirs.append(src_dir_path) - else: - dir_path = os.path.dirname(src_dir_path) - if dir_path: - dirs.append(dir_path) - return list(dict.fromkeys(dirs)) - - @staticmethod - def _move_directory_contents(src_dir_path: str, remote: str) -> None: - """Move every file inside *src_dir_path* into the current working directory. - - After all files have been moved the top-level ancestor of *src_dir_path* - is removed. - """ - try: - for file_to_copy in os.listdir(src_dir_path): - shutil.move(src_dir_path + "/" + file_to_copy, ".") - safe_rm(PurePath(src_dir_path).parts[0]) - except FileNotFoundError: - logger.warning( - f"The 'src:' filter '{src_dir_path}' didn't match any files from '{remote}'" - ) - @staticmethod def _move_src_folder_up(remote: str, src: str) -> None: """Move the files from the src folder into the root of the project. @@ -395,16 +372,22 @@ def _move_src_folder_up(remote: str, src: str) -> None: ) return - unique_dirs = GitLocalRepo._collect_source_dirs(matched_paths) + dirs = unique_parent_dirs(matched_paths) - if len(unique_dirs) > 1: + if len(dirs) > 1: logger.warning( f"The 'src:' filter '{src}' matches multiple directories from '{remote}'. " - f"Only considering files in '{unique_dirs[0]}'." + f"Only considering files in '{dirs[0]}'." ) - if unique_dirs: - GitLocalRepo._move_directory_contents(unique_dirs[0], remote) + if dirs: + try: + move_directory_contents(dirs[0], ".") + safe_rm(PurePath(dirs[0]).parts[0]) + except FileNotFoundError: + logger.warning( + f"The 'src:' filter '{dirs[0]}' didn't match any files from '{remote}'" + ) @staticmethod def _determine_ignore_paths( diff --git a/tests/test_git_vcs.py b/tests/test_git_vcs.py index c76d63f8..ed346345 100644 --- a/tests/test_git_vcs.py +++ b/tests/test_git_vcs.py @@ -10,6 +10,7 @@ import pytest from dfetch.util.cmdline import SubprocessCommandError +from dfetch.util.util import unique_parent_dirs from dfetch.vcs.git import ( GitLocalRepo, GitRemote, @@ -17,7 +18,7 @@ ) # --------------------------------------------------------------------------- -# GitLocalRepo._collect_source_dirs +# unique_parent_dirs (dfetch.util.util) # --------------------------------------------------------------------------- @@ -32,9 +33,9 @@ ("deduplication preserves order", ["a/x.c", "a/y.c"], [False, False], ["a"]), ], ) -def test_collect_source_dirs(name, paths, isdir_results, expected): - with patch("dfetch.vcs.git.os.path.isdir", side_effect=isdir_results): - assert GitLocalRepo._collect_source_dirs(paths) == expected +def test_unique_parent_dirs(name, paths, isdir_results, expected): + with patch("dfetch.util.util.os.path.isdir", side_effect=isdir_results): + assert unique_parent_dirs(paths) == expected # --------------------------------------------------------------------------- From 4d43350bafc4b537f2b74d0f69520591b89948cb Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 25 Mar 2026 22:06:11 +0000 Subject: [PATCH 6/9] fix(git): guard _move_src_folder_up against path-traversal in src filter move_directory_contents had no protection against absolute or traversal src patterns in dfetch.yaml. A malicious or misconfigured manifest entry such as `src: /etc` or `src: ../../sensitive` would cause shutil.move to relocate files from outside the checkout directory into the project destination before safe_rm's check_no_path_traversal could intervene on the deletion step. Two guards added to _move_src_folder_up: 1. Reject absolute src immediately (os.path.isabs) and log a warning. 2. For each path returned by glob.glob(src), resolve it and verify it is relative to the repo root (Path.is_relative_to). Out-of-root matches are logged and dropped; if no safe matches remain the function returns without calling move_directory_contents or safe_rm. Two regression tests added: - test_move_src_folder_up_rejects_absolute_src - test_move_src_folder_up_rejects_traversal_src https://claude.ai/code/session_014ZRQGvyTdWBqoRjtu7gsSC --- dfetch/vcs/git.py | 20 +++++++++++++++++--- tests/test_git_vcs.py | 26 ++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/dfetch/vcs/git.py b/dfetch/vcs/git.py index 16cada0c..b1067b54 100644 --- a/dfetch/vcs/git.py +++ b/dfetch/vcs/git.py @@ -364,15 +364,29 @@ def _move_src_folder_up(remote: str, src: str) -> None: remote (str): Name of the root src (str): Src folder to move up """ - matched_paths = sorted(glob.glob(src)) + if os.path.isabs(src): + logger.warning( + f"The 'src:' filter '{src}' is an absolute path; skipping for '{remote}'" + ) + return + + repo_root = Path(os.getcwd()).resolve() + safe_matched: list[str] = [] + for p in sorted(glob.glob(src)): + if Path(p).resolve().is_relative_to(repo_root): + safe_matched.append(p) + else: + logger.warning( + f"The 'src:' filter '{src}' matched '{p}' outside the repo root; skipping" + ) - if not matched_paths: + if not safe_matched: logger.warning( f"The 'src:' filter '{src}' didn't match any files from '{remote}'" ) return - dirs = unique_parent_dirs(matched_paths) + dirs = unique_parent_dirs(safe_matched) if len(dirs) > 1: logger.warning( diff --git a/tests/test_git_vcs.py b/tests/test_git_vcs.py index ed346345..d5ae3203 100644 --- a/tests/test_git_vcs.py +++ b/tests/test_git_vcs.py @@ -78,6 +78,32 @@ def test_build_ignore_args(name, ignore, expected): assert GitLocalRepo._build_ignore_args(ignore) == expected +# --------------------------------------------------------------------------- +# GitLocalRepo._move_src_folder_up — path-traversal guards +# --------------------------------------------------------------------------- + + +def test_move_src_folder_up_rejects_absolute_src(tmp_path): + """An absolute src pattern must be rejected without touching the filesystem.""" + with patch("dfetch.vcs.git.move_directory_contents") as mock_move: + with patch("dfetch.vcs.git.os.getcwd", return_value=str(tmp_path)): + GitLocalRepo._move_src_folder_up("my-remote", "/etc") + mock_move.assert_not_called() + + +def test_move_src_folder_up_rejects_traversal_src(tmp_path): + """A src pattern that resolves outside the repo root must be skipped.""" + outside = tmp_path.parent / "outside" + outside.mkdir() + (outside / "secret.txt").write_text("data") + + with patch("dfetch.vcs.git.move_directory_contents") as mock_move: + with patch("dfetch.vcs.git.glob.glob", return_value=[str(outside)]): + with patch("dfetch.vcs.git.os.getcwd", return_value=str(tmp_path)): + GitLocalRepo._move_src_folder_up("my-remote", "../outside") + mock_move.assert_not_called() + + @pytest.mark.parametrize( "name, cmd_result, expectation", [ From 9035c2d410ac92888dcff5fe839d9eacb807545f Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 25 Mar 2026 22:40:18 +0000 Subject: [PATCH 7/9] fix(git): resolve dirs to canonical paths before move and cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit unique_parent_dirs returns strings from os.path.dirname which does not normalise '..'. If safe_matched contained 'some/../inside/file.c' (legitimate — resolves within repo_root), dirs[0] would be the string 'some/../inside' and PurePath(...).parts[0] would yield 'some', causing safe_rm to delete the wrong top-level directory instead of 'inside'. Fix: resolve every entry returned by unique_parent_dirs to an absolute canonical Path immediately. Downstream steps then use: move_directory_contents(str(chosen), ".") top = repo_root / chosen.relative_to(repo_root).parts[0] safe_rm(top, within=repo_root) This ensures: - The path fed to move_directory_contents has no '..' components. - The cleanup target is the first component *under* repo_root, derived from the resolved path, not from a raw string split. - safe_rm receives an explicit within=repo_root rather than relying on the implicit within="." default. Also removes the now-unused PurePath import. https://claude.ai/code/session_014ZRQGvyTdWBqoRjtu7gsSC --- dfetch/vcs/git.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/dfetch/vcs/git.py b/dfetch/vcs/git.py index b1067b54..87fbda69 100644 --- a/dfetch/vcs/git.py +++ b/dfetch/vcs/git.py @@ -7,7 +7,7 @@ import tempfile from collections.abc import Generator, Sequence from dataclasses import dataclass -from pathlib import Path, PurePath +from pathlib import Path from dfetch.log import get_logger from dfetch.util.cmdline import SubprocessCommandError, run_on_cmdline @@ -386,21 +386,25 @@ def _move_src_folder_up(remote: str, src: str) -> None: ) return - dirs = unique_parent_dirs(safe_matched) + # Resolve to canonical absolute paths so downstream steps use stable paths + # regardless of any '..' components in the original glob results. + resolved_dirs = [Path(d).resolve() for d in unique_parent_dirs(safe_matched)] - if len(dirs) > 1: + if len(resolved_dirs) > 1: logger.warning( f"The 'src:' filter '{src}' matches multiple directories from '{remote}'. " - f"Only considering files in '{dirs[0]}'." + f"Only considering files in '{resolved_dirs[0]}'." ) - if dirs: + if resolved_dirs: + chosen = resolved_dirs[0] try: - move_directory_contents(dirs[0], ".") - safe_rm(PurePath(dirs[0]).parts[0]) + move_directory_contents(str(chosen), ".") + top = repo_root / chosen.relative_to(repo_root).parts[0] + safe_rm(top, within=repo_root) except FileNotFoundError: logger.warning( - f"The 'src:' filter '{dirs[0]}' didn't match any files from '{remote}'" + f"The 'src:' filter '{chosen}' didn't match any files from '{remote}'" ) @staticmethod From 26bc60e93d51fc59d3c8aea39ad1082cef1ca752 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 26 Mar 2026 19:04:11 +0000 Subject: [PATCH 8/9] fix(tests): disable gpg signing in test repos; fix warning to use relative path The global git config had commit.gpgsign=true which caused all BDD test scenarios that create git repos to error with a signing server failure. Setting commit.gpgsign=false locally in create_repo() lets these tests run. Also corrects the 'Only considering files in' warning to show the relative path rather than the absolute resolved path, restoring the expected output for the multi-directory src filter scenario. https://claude.ai/code/session_014ZRQGvyTdWBqoRjtu7gsSC --- dfetch/vcs/git.py | 3 ++- features/steps/git_steps.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dfetch/vcs/git.py b/dfetch/vcs/git.py index 87fbda69..e5eb6df4 100644 --- a/dfetch/vcs/git.py +++ b/dfetch/vcs/git.py @@ -391,9 +391,10 @@ def _move_src_folder_up(remote: str, src: str) -> None: resolved_dirs = [Path(d).resolve() for d in unique_parent_dirs(safe_matched)] if len(resolved_dirs) > 1: + display = resolved_dirs[0].relative_to(repo_root) logger.warning( f"The 'src:' filter '{src}' matches multiple directories from '{remote}'. " - f"Only considering files in '{resolved_dirs[0]}'." + f"Only considering files in '{display}'." ) if resolved_dirs: diff --git a/features/steps/git_steps.py b/features/steps/git_steps.py index b0ea4a27..7254e07d 100644 --- a/features/steps/git_steps.py +++ b/features/steps/git_steps.py @@ -26,6 +26,7 @@ def create_repo(): subprocess.check_call(["git", "config", "user.email", "you@example.com"]) subprocess.check_call(["git", "config", "user.name", "John Doe"]) + subprocess.check_call(["git", "config", "commit.gpgsign", "false"]) if os.name == "nt": # Creates zombie fsmonitor-daemon process that holds files From 725149613fe0660a8b49266ebc81dfa5248b3299 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 26 Mar 2026 21:49:26 +0000 Subject: [PATCH 9/9] fix(git): guard against IndexError when chosen equals repo_root chosen.relative_to(repo_root).parts is empty when the resolved src directory is the repo root itself (e.g. src: .). Extracting parts into a variable and checking it before indexing prevents an IndexError and correctly skips the safe_rm step in that edge case. https://claude.ai/code/session_014ZRQGvyTdWBqoRjtu7gsSC --- dfetch/vcs/git.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dfetch/vcs/git.py b/dfetch/vcs/git.py index e5eb6df4..a9be2869 100644 --- a/dfetch/vcs/git.py +++ b/dfetch/vcs/git.py @@ -401,8 +401,9 @@ def _move_src_folder_up(remote: str, src: str) -> None: chosen = resolved_dirs[0] try: move_directory_contents(str(chosen), ".") - top = repo_root / chosen.relative_to(repo_root).parts[0] - safe_rm(top, within=repo_root) + parts = chosen.relative_to(repo_root).parts + if parts: + safe_rm(repo_root / parts[0], within=repo_root) except FileNotFoundError: logger.warning( f"The 'src:' filter '{chosen}' didn't match any files from '{remote}'"