From 6fd7a6366df53154906137cc38e7782495cc12ce Mon Sep 17 00:00:00 2001
From: Claude
Date: Fri, 20 Mar 2026 07:00:14 +0000
Subject: [PATCH 01/35] Add feature tests and manifest schema for archive
 (tar/zip) support (#23)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces BDD feature tests and manifest schema changes for fetching
dependencies from tar/zip archives as described in issue #23.

Manifest changes:
- Extend vcs enum to include 'archive' alongside 'git' and 'svn'
- Add optional 'hash:' field using 'algorithm:hex' format (sha256 only
  for now, designed for future extension to sha512 etc.)
- Add 'hash' to ProjectEntry and ProjectEntryDict

New feature tests:
- features/fetch-archive.feature – fetch from tar.gz/zip, with/without
  hash verification, src:/ignore: filtering, force re-fetch, multiple
  archives
- features/check-archive.feature – up-to-date check, unreachable URL,
  local change detection
- features/freeze-archive.feature – freeze adds sha256 hash to manifest
- features/validate-manifest.feature – valid archive manifest and
  invalid hash format validation scenarios

New steps:
- features/steps/archive_steps.py – creates local tar.gz/zip test
  archives, computes and stores sha256 in context for template
  substitution

Test infrastructure:
- manifest_steps.py: apply_manifest_substitutions() replaces
  'some-remote-server' URLs and '<archive_sha256>' placeholder
- generic_steps.py: check_output() substitutes '<archive_sha256>' in
  expected text before comparison

https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm
---
 dfetch/manifest/project.py         |   8 ++
 dfetch/manifest/schema.py          |   5 +-
 features/check-archive.feature     | 114 ++++++++++++++++
 features/fetch-archive.feature     | 205 +++++++++++++++++++++++++++++
 features/freeze-archive.feature    |  62 +++++++++
 features/steps/archive_steps.py    |  70 ++++++++++
 features/steps/generic_steps.py    |   6 +-
 features/steps/manifest_steps.py   |  16 ++-
 features/validate-manifest.feature |  45 +++++++
 9 files changed, 525 insertions(+), 6 deletions(-)
 create mode 100644 features/check-archive.feature
 create mode 100644 features/fetch-archive.feature
 create mode 100644 features/freeze-archive.feature
 create mode 100644 features/steps/archive_steps.py

diff --git a/dfetch/manifest/project.py b/dfetch/manifest/project.py
index 2d66a136..9a1ab7b1 100644
--- a/dfetch/manifest/project.py
+++ b/dfetch/manifest/project.py
@@ -300,6 +300,7 @@
         "repo-path": str,
         "vcs": str,
         "ignore": Sequence[str],
+        "hash": str,
         "default_remote": str,
     },
     total=False,
@@ -327,6 +328,7 @@ def __init__(self, kwargs: ProjectEntryDict) -> None:
         self._tag: str = kwargs.get("tag", "")
         self._vcs: str = kwargs.get("vcs", "")
         self._ignore: Sequence[str] = kwargs.get("ignore", [])
+        self._hash: str = kwargs.get("hash", "")
 
         if not self._remote and not self._url:
             self._remote = kwargs.get("default_remote", "")
@@ -443,6 +445,11 @@ def ignore(self) -> Sequence[str]:
         """Get the list of files/folders to ignore from this project (relative to src)."""
         return self._ignore
 
+    @property
+    def hash(self) -> str:
+        """Get the expected hash of the archive (format: 'algorithm:hex-value')."""
+        return self._hash
+
     def __repr__(self) -> str:
         """Get a string representation of this project entry."""
         version = (
@@ -477,6 +484,7 @@ def as_yaml(self) -> dict[str, str | list[str]]:
             "tag": self._tag,
             "repo-path": self._repo_path,
             "vcs": self._vcs,
+            "hash": self._hash,
         }
 
         return {k: v for k, v in yamldata.items() if v}

diff --git a/dfetch/manifest/schema.py b/dfetch/manifest/schema.py
index 823b63ce..8a9e958c 100644
--- a/dfetch/manifest/schema.py
+++ b/dfetch/manifest/schema.py
@@ -15,6 +15,8 @@
     }
 )
 
+HASH_STR = Regex(r"^(sha256):[a-fA-F0-9]+$")
+
 PROJECT_SCHEMA = Map(
     {
         "name": SAFE_STR,
@@ -26,9 +28,10 @@
         Optional("repo-path"): SAFE_STR,
         Optional("remote"): SAFE_STR,
         Optional("patch"): SAFE_STR | Seq(SAFE_STR),
-        Optional("vcs"): Enum(["git", "svn"]),
+        Optional("vcs"): Enum(["git", "svn", "archive"]),
         Optional("src"): SAFE_STR,
         Optional("ignore"): Seq(SAFE_STR),
+        Optional("hash"): HASH_STR,
     }
 )

diff --git a/features/check-archive.feature b/features/check-archive.feature
new file mode 100644
index 00000000..bba9b3dc
--- /dev/null
+++ b/features/check-archive.feature
@@ -0,0 +1,114 @@
+Feature: Checking dependencies from an archive
+
+  DFetch can check if archive-based projects are up-to-date.
+  For archives without a hash, the URL is used as the version identifier.
+  For archives with a 'hash:' field, the hash is verified against the
+  downloaded archive to determine if the content matches.
+
+  Scenario: Archive project without hash is reported as up-to-date after fetch
+    Given an archive "SomeProject.tar.gz" with the files
+      | path      |
+      | README.md |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+      """
+    And all projects are updated in MyProject
+    When I run "dfetch check" in MyProject
+    Then the output shows
+      """
+      Dfetch (0.12.1)
+        SomeProject:
+          > up-to-date (some-remote-server/SomeProject.tar.gz)
+      """
+
+  Scenario: Archive project with correct sha256 hash is reported as up-to-date
+    Given an archive "SomeProject.tar.gz" with the files
+      | path      |
+      | README.md |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+            hash: sha256:<archive_sha256>
+      """
+    And all projects are updated in MyProject
+    When I run "dfetch check" in MyProject
+    Then the output shows
+      """
+      Dfetch (0.12.1)
+        SomeProject:
+          > up-to-date (sha256:<archive_sha256>)
+      """
+
+  Scenario: Archive project that has not been fetched yet is reported
+    Given an archive "SomeProject.tar.gz" with the files
+      | path      |
+      | README.md |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+      """
+    When I run "dfetch check" in MyProject
+    Then the output shows
+      """
+      Dfetch (0.12.1)
+        SomeProject:
+          > wanted (some-remote-server/SomeProject.tar.gz), but never fetched!
+      """
+
+  Scenario: Non-existent archive URL is reported
+    Given the manifest 'dfetch.yaml'
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: non-existent-archive
+            url: https://example.com/does-not-exist.tar.gz
+            vcs: archive
+      """
+    When I run "dfetch check"
+    Then the output shows
+      """
+      Dfetch (0.12.1)
+        non-existent-archive:
+          > 'https://example.com/does-not-exist.tar.gz' is not a valid URL or unreachable
+      """
+
+  Scenario: Archive with local changes is reported
+    Given an archive "SomeProject.tar.gz" with the files
+      | path      |
+      | README.md |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+      """
+    And all projects are updated in MyProject
+    And "SomeProject/README.md" in MyProject is changed locally
+    When I run "dfetch check SomeProject" in MyProject
+    Then the output shows
+      """
+      Dfetch (0.12.1)
+        SomeProject:
+          > Local changes were detected, please generate a patch using 'dfetch diff SomeProject' and add it to your manifest using 'patch:'. Alternatively overwrite the local changes with 'dfetch update --force SomeProject'
+          > up-to-date (some-remote-server/SomeProject.tar.gz)
+      """

diff --git a/features/fetch-archive.feature b/features/fetch-archive.feature
new file mode 100644
index 00000000..2f5bdd2c
--- /dev/null
+++ b/features/fetch-archive.feature
@@ -0,0 +1,205 @@
+Feature: Fetching dependencies from an archive (tar/zip)
+
+  Some projects are distributed as tar or zip archives, for example as GitHub
+  release assets or on internal artifact servers. DFetch supports fetching
+  these archives using the 'archive' vcs type. Optionally, a hash can be
+  specified for integrity verification using 'hash: <algorithm>:<hash>'.
+
+  Scenario: Tar.gz archive project is fetched
+    Given an archive "SomeProject.tar.gz" with the files
+      | path       |
+      | README.md  |
+      | src/main.c |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+      """
+    When I run "dfetch update" in MyProject
+    Then 'MyProject' looks like:
+      """
+      MyProject/
+          SomeProject/
+              .dfetch_data.yaml
+              README.md
+              src/
+                  main.c
+          dfetch.yaml
+      """
+
+  Scenario: Zip archive project is fetched
+    Given an archive "SomeProject.zip" with the files
+      | path          |
+      | README.md     |
+      | include/lib.h |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.zip
+            vcs: archive
+      """
+    When I run "dfetch update" in MyProject
+    Then 'MyProject' looks like:
+      """
+      MyProject/
+          SomeProject/
+              .dfetch_data.yaml
+              README.md
+              include/
+                  lib.h
+          dfetch.yaml
+      """
+
+  Scenario: Archive project with sha256 hash verification is fetched
+    Given an archive "SomeProject.tar.gz" with the files
+      | path      |
+      | README.md |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+            hash: sha256:<archive_sha256>
+      """
+    When I run "dfetch update" in MyProject
+    Then the following projects are fetched
+      | path        |
+      | SomeProject |
+
+  Scenario: Archive with incorrect sha256 hash is rejected
+    Given an archive "SomeProject.tar.gz" with the files
+      | path      |
+      | README.md |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+            hash: sha256:0000000000000000000000000000000000000000000000000000000000000000
+      """
+    When I run "dfetch update" in MyProject
+    Then the output shows
+      """
+      Dfetch (0.12.1)
+        SomeProject:
+          > Hash mismatch for SomeProject! sha256 expected 0000000000000000000000000000000000000000000000000000000000000000
+      """
+
+  Scenario: Specific directory from archive can be fetched
+    Given an archive "SomeProject.tar.gz" with the files
+      | path              |
+      | README.md         |
+      | src/main.c        |
+      | src/lib.c         |
+      | tests/test_main.c |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+            src: src/
+      """
+    When I run "dfetch update" in MyProject
+    Then 'MyProject' looks like:
+      """
+      MyProject/
+          SomeProject/
+              .dfetch_data.yaml
+              README.md
+              lib.c
+              main.c
+          dfetch.yaml
+      """
+
+  Scenario: Files can be ignored when fetching from archive
+    Given an archive "SomeProject.tar.gz" with the files
+      | path              |
+      | README.md         |
+      | src/main.c        |
+      | tests/test_main.c |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+            ignore:
+              - tests
+      """
+    When I run "dfetch update" in MyProject
+    Then 'MyProject' looks like:
+      """
+      MyProject/
+          SomeProject/
+              .dfetch_data.yaml
+              README.md
+              src/
+                  main.c
+          dfetch.yaml
+      """
+
+  Scenario: Archive is re-fetched when force flag is given
+    Given an archive "SomeProject.tar.gz" with the files
+      | path      |
+      | README.md |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+      """
+    And all projects are updated in MyProject
+    When I run "dfetch update --force" in MyProject
+    Then the output shows
+      """
+      Dfetch (0.12.1)
+        SomeProject:
+          > Fetched some-remote-server/SomeProject.tar.gz
+      """
+
+  Scenario: Multiple archive projects are fetched
+    Given an archive "LibA.tar.gz" with the files
+      | path      |
+      | README.md |
+    And an archive "LibB.zip" with the files
+      | path      |
+      | README.md |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: LibA
+            url: some-remote-server/LibA.tar.gz
+            vcs: archive
+
+          - name: LibB
+            url: some-remote-server/LibB.zip
+            vcs: archive
+      """
+    When I run "dfetch update" in MyProject
+    Then the following projects are fetched
+      | path |
+      | LibA |
+      | LibB |

diff --git a/features/freeze-archive.feature b/features/freeze-archive.feature
new file mode 100644
index 00000000..baa9584c
--- /dev/null
+++ b/features/freeze-archive.feature
@@ -0,0 +1,62 @@
+Feature: Freeze archive dependencies
+
+  For archive projects, 'dfetch freeze' adds a sha256 hash to the manifest
+  to pin the exact archive content. This uses the 'hash: sha256:<hash>'
+  format, which can be extended to other algorithms in the future.
+
+  Scenario: Archive project is frozen with its sha256 hash
+    Given an archive "SomeProject.tar.gz" with the files
+      | path      |
+      | README.md |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+      """
+    And all projects are updated in MyProject
+    When I run "dfetch freeze" in MyProject
+    Then the manifest 'dfetch.yaml' is replaced with
+      """
+      manifest:
+        version: '0.0'
+
+        projects:
+        - name: SomeProject
+          url: some-remote-server/SomeProject.tar.gz
+          vcs: archive
+          hash: sha256:<archive_sha256>
+
+      """
+
+  Scenario: Already frozen archive project is not changed by freeze
+    Given an archive "SomeProject.tar.gz" with the files
+      | path      |
+      | README.md |
+    And the manifest 'dfetch.yaml' in MyProject
+      """
+      manifest:
+        version: '0.0'
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+            hash: sha256:<archive_sha256>
+      """
+    And all projects are updated in MyProject
+    When I run "dfetch freeze" in MyProject
+    Then the manifest 'dfetch.yaml' is replaced with
+      """
+      manifest:
+        version: '0.0'
+
+        projects:
+        - name: SomeProject
+          url: some-remote-server/SomeProject.tar.gz
+          vcs: archive
+          hash: sha256:<archive_sha256>
+
+      """

diff --git a/features/steps/archive_steps.py b/features/steps/archive_steps.py
new file mode 100644
index 00000000..aa5ee229
--- /dev/null
+++ b/features/steps/archive_steps.py
@@ -0,0 +1,70 @@
+"""Steps for archive-based feature tests."""
+
+# pylint: disable=function-redefined, missing-function-docstring, import-error, not-callable
+# pyright: reportRedeclaration=false, reportAttributeAccessIssue=false, reportCallIssue=false
+
+import hashlib
+import io
+import os
+import pathlib
+import tarfile
+import zipfile
+
+from behave import given  # pylint: disable=no-name-in-module
+
+from dfetch.util.util import in_directory
+
+
+def compute_sha256(path: str) -> str:
+    """Compute the SHA-256 hash of a file."""
+    h = hashlib.sha256()
+    with open(path, "rb") as f:
+        for chunk in iter(lambda: f.read(8192), b""):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+def create_tar_gz(archive_path: str, name: str, files: list[dict]) -> None:
+    """Create a .tar.gz archive with files nested under a top-level <name>/ directory."""
+    with tarfile.open(archive_path, "w:gz") as tar:
+        for file in files:
+            content = f"Generated file {file['path']}\n".encode()
+            member_path = f"{name}/{file['path']}"
+            info = tarfile.TarInfo(name=member_path)
+            info.size = len(content)
+            tar.addfile(info, io.BytesIO(content))
+
+
+def create_zip(archive_path: str, name: str, files: list[dict]) -> None:
+    """Create a .zip archive with files nested under a top-level <name>/ directory."""
+    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as zf:
+        for file in files:
+            content = f"Generated file {file['path']}\n"
+            member_path = f"{name}/{file['path']}"
+            zf.writestr(member_path, content)
+
+
+@given('an archive "{name}.tar.gz" with the files')
+@given('an archive "{name}.tar.gz"')
+def step_impl(context, name):
+    server_path = context.remotes_dir_path
+    pathlib.Path(server_path).mkdir(parents=True, exist_ok=True)
+
+    archive_path = os.path.join(server_path, f"{name}.tar.gz")
+    files = list(context.table) if context.table else [{"path": "README.md"}]
+    create_tar_gz(archive_path, name, files)
+
+    context.archive_sha256 = compute_sha256(archive_path)
+
+
+@given('an archive "{name}.zip" with the files')
+@given('an archive "{name}.zip"')
+def step_impl(context, name):
+    server_path = context.remotes_dir_path
+    pathlib.Path(server_path).mkdir(parents=True, exist_ok=True)
+
+    archive_path = os.path.join(server_path, f"{name}.zip")
+    files = list(context.table) if context.table else [{"path": "README.md"}]
+    create_zip(archive_path, name, files)
+
+    context.archive_sha256 = compute_sha256(archive_path)

diff --git a/features/steps/generic_steps.py b/features/steps/generic_steps.py
index 8d35a44f..66494a02 100644
--- a/features/steps/generic_steps.py
+++ b/features/steps/generic_steps.py
@@ -196,6 +196,10 @@ def check_output(context, line_count=None):
         context: Behave context with cmd_output and expected text
         line_count: If set, compare only the first N lines of actual output
     """
+    expected_raw = context.text
+    if hasattr(context, "archive_sha256"):
+        expected_raw = expected_raw.replace("<archive_sha256>", context.archive_sha256)
+
     expected_text = multisub(
         patterns=[
             (dfetch_title, "Dfetch (x.x.x)"),
@@ -204,7 +208,7 @@ def check_output(context, line_count=None):
             (ansi_escape, ""),
             (svn_error, "svn: EXXXXXX: "),
         ],
-        text=context.text,
+        text=expected_raw,
     )
 
     actual_text = multisub(

diff --git a/features/steps/manifest_steps.py b/features/steps/manifest_steps.py
index 13641e22..ce2873c9 100644
--- a/features/steps/manifest_steps.py
+++ b/features/steps/manifest_steps.py
@@ -12,13 +12,21 @@
 from features.steps.generic_steps import check_file, generate_file, remote_server_path
 
 
+def apply_manifest_substitutions(context, contents: str) -> str:
+    """Apply context-specific substitutions to manifest contents."""
+    result = contents.replace(
+        "url: some-remote-server", f"url: file:///{remote_server_path(context)}"
+    )
+    if hasattr(context, "archive_sha256"):
+        result = result.replace("<archive_sha256>", context.archive_sha256)
+    return result
+
+
 def generate_manifest(
     context, name="dfetch.yaml", contents: Optional[str] = None, path=None
 ):
     contents = contents or context.text
-    manifest = contents.replace(
-        "url: some-remote-server", f"url: file:///{remote_server_path(context)}"
-    )
+    manifest = apply_manifest_substitutions(context, contents)
 
     generate_file(os.path.join(path or os.getcwd(), name), manifest)
 
@@ -37,7 +45,7 @@ def step_impl(context, name, path=None):
 @then("it should generate the manifest '{name}'")
 def step_impl(context, name):
     """Check a manifest."""
-    check_file(name, context.text)
+    check_file(name, apply_manifest_substitutions(context, context.text))
 
 
 @given("the manifest '{name}' with the projects:")

diff --git a/features/validate-manifest.feature b/features/validate-manifest.feature
index dd32b95a..bceb0e89 100644
--- a/features/validate-manifest.feature
+++ b/features/validate-manifest.feature
@@ -51,6 +51,51 @@ Feature: Validate a manifest
           unexpected key not in schema 'manifest-wrong'
       """
 
+  Scenario: A valid archive manifest with hash is validated
+    Given the manifest 'dfetch.yaml'
+      """
+      manifest:
+        version: '0.0'
+
+        projects:
+          - name: SomeLib
+            url: https://example.com/SomeLib-1.0.tar.gz
+            vcs: archive
+            hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+
+      """
+    When I run "dfetch validate"
+    Then the output shows
+      """
+      Dfetch (0.12.1)
+      dfetch.yaml : valid
+      """
+
+  Scenario: A manifest with an invalid hash format is rejected
+    Given the manifest 'dfetch.yaml'
+      """
+      manifest:
+        version: '0.0'
+
+        projects:
+          - name: SomeLib
+            url: https://example.com/SomeLib-1.0.tar.gz
+            vcs: archive
+            hash: not-a-valid-hash
+
+      """
+    When I run "dfetch validate"
+    Then the output shows
+      """
+      Dfetch (0.12.1)
+      Schema validation failed:
+
+      hash: not-a-valid-hash
+           ^ (line: 9)
+
+      when expecting a string matching ^(sha256):[a-fA-F0-9]+$
+      """
+
   Scenario: A manifest with duplicate project names
     Given the manifest 'dfetch.yaml'
       """

From c6f80190da6f5cbd1879f9b682110d3b1af890dc Mon Sep 17 00:00:00 2001
From: Claude
Date: Fri, 20 Mar 2026 07:57:54 +0000
Subject: [PATCH 02/35] Implement archive (tar/zip) VCS type with hash
 verification (#23)

Add full support for fetching dependencies distributed as tar.gz, tgz,
tar.bz2, tar.xz or zip archives. Archives can be pinned with a sha256
hash for integrity verification using `hash: sha256:<hex-digest>`.

- dfetch/vcs/archive.py: ArchiveRemote (HEAD check + download via
  urllib) and ArchiveLocalRepo (extract, strip top-level dir, apply
  src:/ignore: filtering)
- dfetch/project/archivesubproject.py: ArchiveSubProject implementing
  the SubProject interface; wanted_version returns the sha256 hash (if
  set) or remote URL so the metadata / check logic works correctly
- dfetch/project/subproject.py: add freeze_project() method that
  ArchiveSubProject overrides to write project.hash instead of
  project.version
- dfetch/project/__init__.py: register ArchiveSubProject as first
  candidate so explicit `vcs: archive` is matched first
- dfetch/manifest/project.py: add hash field to ProjectEntry and
  ProjectEntryDict; extend Archive/Hash docstring with code examples
- dfetch/manifest/schema.py (from previous commit): vcs enum extended
  to include archive; HASH_STR regex validates sha256:<hex> format
- dfetch/commands/freeze.py: delegate to sub_project.freeze_project();
  add Archive scenario-include to docstring
- dfetch/commands/update.py / check.py: add Archive tab to docstring
- doc/manifest.rst: add archive to vcs enum and hash field in schema
- features/fetch-archive.feature: 8 BDD scenarios (tar.gz, zip, hash
  verify, hash mismatch, src:, ignore:, force, multiple archives)
- features/check-archive.feature: 5 BDD scenarios
- features/freeze-archive.feature: 2 BDD scenarios

https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm
---
 dfetch/commands/check.py            |   4 +
 dfetch/commands/freeze.py           |  39 +++--
 dfetch/commands/update.py           |   4 +
 dfetch/manifest/project.py          |  45 ++++++
 dfetch/project/__init__.py          |   3 +-
 dfetch/project/archivesubproject.py | 232 +++++++++++++++++++++++++++
 dfetch/project/subproject.py        |  20 +++
 dfetch/vcs/archive.py               | 239 ++++++++++++++++++++++++++++
 doc/manifest.rst                    |   9 ++
 features/check-archive.feature      |  14 +-
 features/fetch-archive.feature      |  17 +-
 features/freeze-archive.feature     |   2 +
 12 files changed, 595 insertions(+), 33 deletions(-)
 create mode 100644 dfetch/project/archivesubproject.py
 create mode 100644 dfetch/vcs/archive.py

diff --git a/dfetch/commands/check.py b/dfetch/commands/check.py
index e338fbb3..b42c59a0 100644
--- a/dfetch/commands/check.py
+++ b/dfetch/commands/check.py
@@ -15,6 +15,10 @@
 
         .. scenario-include:: ../features/check-svn-repo.feature
 
+    .. tab:: Archive
+
+        .. scenario-include:: ../features/check-archive.feature
+
 Sub-manifests
 ~~~~~~~~~~~~~
 

diff --git a/dfetch/commands/freeze.py b/dfetch/commands/freeze.py
index c3e38137..b10e4a92 100644
--- a/dfetch/commands/freeze.py
+++ b/dfetch/commands/freeze.py
@@ -36,6 +36,13 @@
 
 .. scenario-include:: ../features/freeze-projects.feature
 
+For archive projects, ``dfetch freeze`` adds a ``hash: sha256:<hex>`` field
+to pin the exact archive content used. This field acts as the version
+identifier: DFetch verifies the downloaded archive against it on every
+subsequent ``dfetch update``.
+
+.. scenario-include:: ../features/freeze-archive.feature
+
 """
 
 import argparse
@@ -78,24 +85,24 @@ def __call__(self, args: argparse.Namespace) -> None:
         with in_directory(superproject.root_directory):
             for project in superproject.manifest.projects:
                 with catch_runtime_exceptions(exceptions) as exceptions:
-                    on_disk_version = dfetch.project.create_sub_project(
-                        project
-                    ).on_disk_version()
-
-                    if project.version == on_disk_version:
-                        logger.print_info_line(
-                            project.name,
-                            f"Already pinned in manifest on version {project.version}",
-                        )
-                    elif on_disk_version:
-                        logger.print_info_line(
-                            project.name, f"Freezing on version {on_disk_version}"
-                        )
-                        project.version = on_disk_version
+                    sub_project = dfetch.project.create_sub_project(project)
+                    on_disk_version = sub_project.on_disk_version()
+
+                    if not sub_project.freeze_project(project):
+                        if on_disk_version:
+                            logger.print_info_line(
+                                project.name,
+                                f"Already pinned in manifest on version {on_disk_version}",
+                            )
+                        else:
+                            logger.print_warning_line(
+                                project.name,
+                                "No version on disk, first update with 'dfetch update'",
+                            )
                     else:
-                        logger.print_warning_line(
+                        logger.print_info_line(
                             project.name,
-                            "No version on disk, first update with 'dfetch update'",
+                            f"Freezing on version {on_disk_version}",
                         )
 
                 projects.append(project)

diff --git a/dfetch/commands/update.py b/dfetch/commands/update.py
index ab1ddd35..6d108607 100644
--- a/dfetch/commands/update.py
+++ b/dfetch/commands/update.py
@@ -15,6 +15,10 @@
 
         .. scenario-include:: ../features/fetch-svn-repo.feature
 
+    .. tab:: Archive
+
+        .. scenario-include:: ../features/fetch-archive.feature
+
 Sub-manifests
 ~~~~~~~~~~~~~~~
 

diff --git a/dfetch/manifest/project.py b/dfetch/manifest/project.py
index 9a1ab7b1..7d144a0f 100644
--- a/dfetch/manifest/project.py
+++ b/dfetch/manifest/project.py
@@ -221,6 +221,46 @@
         vcs: svn
         repo-path: cpputest/cpputest
 
+Archive
+#######
+Projects distributed as ``.tar.gz``, ``.tgz``, ``.tar.bz2``, ``.tar.xz`` or ``.zip`` archive files
+can be fetched using ``vcs: archive``. DFetch downloads the archive from the ``url:`` and extracts
+it to the destination directory, stripping the top-level directory if present.
+
+The ``src:`` and ``ignore:`` attributes work the same way as for git/SVN projects.
+
+.. code-block:: yaml
+
+    manifest:
+        version: 0.0
+
+        projects:
+            - name: my-library
+              vcs: archive
+              url: https://example.com/releases/my-library-1.0.tar.gz
+
+Hash verification
+*****************
+Use the ``hash:`` attribute to verify the integrity of the downloaded archive.
+The format is ``<algorithm>:<hex-digest>``. Only ``sha256`` is supported
+today; the format is designed to be extended (``sha512``, etc.).
+
+.. code-block:: yaml
+
+    manifest:
+        version: 0.0
+
+        projects:
+            - name: my-library
+              vcs: archive
+              url: https://example.com/releases/my-library-1.0.tar.gz
+              hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+
+Run ``dfetch freeze`` after an initial ``dfetch update`` to add the sha256 hash to
+the manifest automatically.
+
+.. scenario-include:: ../features/fetch-archive.feature
+
 Patch
 #####
 *DFetch* promotes upstreaming changes, but also allows local changes. These changes can be managed with local patch
@@ -450,6 +490,11 @@ def hash(self) -> str:
         """Get the expected hash of the archive (format: 'algorithm:hex-value')."""
         return self._hash
 
+    @hash.setter
+    def hash(self, value: str) -> None:
+        """Set the expected hash of the archive."""
+        self._hash = value
+
     def __repr__(self) -> str:
         """Get a string representation of this project entry."""
         version = (

diff --git a/dfetch/project/__init__.py b/dfetch/project/__init__.py
index 57dfde14..96191220 100644
--- a/dfetch/project/__init__.py
+++ b/dfetch/project/__init__.py
@@ -6,6 +6,7 @@
 import dfetch.manifest.project
 from dfetch.log import get_logger
 from dfetch.manifest.parse import find_manifest, parse
+from dfetch.project.archivesubproject import ArchiveSubProject
 from dfetch.project.gitsubproject import GitSubProject
 from dfetch.project.gitsuperproject import GitSuperProject
 from dfetch.project.subproject import SubProject
@@ -14,7 +15,7 @@
 from dfetch.project.svnsubproject import SvnSubProject
 from dfetch.project.svnsuperproject import SvnSuperProject
 from dfetch.util.util import resolve_absolute_path
 
-SUPPORTED_SUBPROJECT_TYPES = [GitSubProject, SvnSubProject]
+SUPPORTED_SUBPROJECT_TYPES = [ArchiveSubProject, GitSubProject, SvnSubProject]
 SUPPORTED_SUPERPROJECT_TYPES = [GitSuperProject, SvnSuperProject]
 
 logger = get_logger(__name__)

diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py
new file mode 100644
index 00000000..fb32d9b0
--- /dev/null
+++ b/dfetch/project/archivesubproject.py
@@ -0,0 +1,232 @@
+"""Archive (tar/zip) specific implementation.
+
+Archives are a third VCS type alongside ``git`` and ``svn``. They represent
+versioned dependencies that are distributed as ``.tar.gz``, ``.tgz``,
+``.tar.bz2``, ``.tar.xz`` or ``.zip`` files reachable via any URL that Python's
+:mod:`urllib.request` understands (``http://``, ``https://``, ``file://``, …).
+
+Unlike git and SVN, archives have no inherent "branching" or "tagging"
+concept. Version identity is expressed through:
+
+* **No hash** – the URL itself acts as the identity. The archive is
+  considered up-to-date as long as the same URL is still reachable.
+* **``hash: <algorithm>:<hex>``** – the cryptographic hash of the archive file
+  acts as the version identifier. The fetch step verifies the downloaded
+  archive against this hash and raises an error on mismatch.
+
+The ``hash:`` field is intended to be extended to additional algorithms in
+the future; only ``sha256`` is supported today.
+
+Example manifest entries::
+
+    projects:
+      # URL-pinned (no integrity check)
+      - name: my-headers
+        url: https://example.com/my-headers-latest.tar.gz
+        vcs: archive
+
+      # Hash-pinned (integrity verified on every fetch)
+      - name: my-library
+        url: https://example.com/releases/my-library-1.0.tar.gz
+        vcs: archive
+        hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+
+.. scenario-include:: ../features/fetch-archive.feature
+"""
+
+from __future__ import annotations
+
+import pathlib
+import tempfile
+
+from dfetch.log import get_logger
+from dfetch.manifest.project import ProjectEntry
+from dfetch.manifest.version import Version
+from dfetch.project.subproject import SubProject
+from dfetch.vcs.archive import (
+    SUPPORTED_HASH_ALGORITHMS,
+    ArchiveLocalRepo,
+    ArchiveRemote,
+    compute_hash,
+    is_archive_url,
+    _suffix_for_url,
+)
+
+logger = get_logger(__name__)
+
+
+class ArchiveSubProject(SubProject):
+    """A project fetched from a tar/zip archive URL.
+
+    Supports ``src:`` (sub-path extraction), ``ignore:`` (file exclusion) and
+    ``patch:`` (local patches applied after every fetch) in the same way as
+    the git and SVN implementations.
+    """
+
+    NAME = "archive"
+
+    def __init__(self, project: ProjectEntry) -> None:
+        """Create an ArchiveSubProject."""
+        super().__init__(project)
+        self._project_entry = project
+        self._remote_repo = ArchiveRemote(project.remote_url)
+
+    # ------------------------------------------------------------------
+    # SubProject abstract interface
+    # ------------------------------------------------------------------
+
+    def check(self) -> bool:
+        """Return *True* when the project URL looks like an archive."""
+        return is_archive_url(self.remote)
+
+    @staticmethod
+    def revision_is_enough() -> bool:
+        """Archives are uniquely identified by their hash (or URL), so yes."""
+        return True
+
+    @staticmethod
+    def list_tool_info() -> None:
+        """Log information about the archive fetching tool (Python's urllib)."""
+        import urllib.request as _ur  # noqa: PLC0415
+
+        SubProject._log_tool("urllib", _ur.__doc__ or "built-in")
+
+    def get_default_branch(self) -> str:
+        """Archives have no branches; return an empty string."""
+        return ""
+
+    def _latest_revision_on_branch(self, branch: str) -> str:  # noqa: ARG002
+        """For archives the 'latest revision' is always the URL (or hash)."""
+        return self._project_entry.remote_url
+
+    def _does_revision_exist(self, revision: str) -> bool:
+        """Check whether *revision* (a hash or URL string) is still valid.
+
+        * If *revision* starts with a known hash algorithm prefix (e.g.
+          ``sha256:``) the archive is downloaded and the hash verified.
+        * Otherwise *revision* is treated as the URL itself and a lightweight
+          reachability check is performed.
+        """
+        for algo in SUPPORTED_HASH_ALGORITHMS:
+            if revision.startswith(f"{algo}:"):
+                expected_hex = revision.split(":", 1)[1]
+                try:
+                    with tempfile.NamedTemporaryFile(
+                        suffix=_suffix_for_url(self.remote), delete=False
+                    ) as tmp:
+                        self._remote_repo.download(tmp.name)
+                        actual = compute_hash(tmp.name, algo)
+                        return actual == expected_hex
+                except RuntimeError:
+                    return False
+
+        # revision is the URL – just check accessibility
+        return self._remote_repo.is_accessible()
+
+    def _list_of_tags(self) -> list[str]:
+        """Archives have no tags; return an empty list."""
+        return []
+
+    # ------------------------------------------------------------------
+    # Version overrides
+    # ------------------------------------------------------------------
+
+    @property
+    def wanted_version(self) -> Version:
+        """Version derived from the ``hash:`` field or the archive URL.
+
+        * With ``hash: sha256:<hex>`` → ``Version(revision='sha256:<hex>')``
+        * Without hash → ``Version(revision=<url>)``
+
+        This makes the standard :class:`~dfetch.project.subproject.SubProject`
+        comparison machinery work transparently for archives.
+        """
+        if self._project_entry.hash:
+            return Version(revision=self._project_entry.hash)
+        return Version(revision=self._project_entry.remote_url)
+
+    # ------------------------------------------------------------------
+    # Fetch
+    # ------------------------------------------------------------------
+
+    def _fetch_impl(self, version: Version) -> Version:
+        """Download and extract the archive to the local destination.
+
+        1. Download the archive to a temporary file.
+        2. If ``hash:`` is specified, verify the downloaded file.
+        3. Extract to :attr:`local_path`, respecting ``src:`` and ``ignore:``.
+
+        Raises:
+            RuntimeError: On download failure or hash mismatch.
+
+        Returns:
+            The version that was actually fetched (hash string or URL).
+        """
+        url = self._project_entry.remote_url
+        expected_hash = self._project_entry.hash
+
+        pathlib.Path(self.local_path).mkdir(parents=True, exist_ok=True)
+
+        suffix = _suffix_for_url(url)
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+            tmp_path = tmp.name
+
+        try:
+            self._remote_repo.download(tmp_path)
+
+            if expected_hash:
+                algorithm, expected_hex = expected_hash.split(":", 1)
+                actual_hex = compute_hash(tmp_path, algorithm)
+                if actual_hex != expected_hex:
+                    raise RuntimeError(
+                        f"Hash mismatch for {self._project_entry.name}! "
+                        f"{algorithm} expected {expected_hex}"
+                    )
+
+            ArchiveLocalRepo.extract(
+                tmp_path,
+                self.local_path,
+                src=self.source,
+                ignore=self.ignore,
+            )
+        finally:
+            try:
+                import os
+
+                os.remove(tmp_path)
+            except OSError:
+                pass
+
+        if expected_hash:
+            return Version(revision=expected_hash)
+        return Version(revision=url)
+
+    # ------------------------------------------------------------------
+    # Freeze support
+    # ------------------------------------------------------------------
+
+    def freeze_project(self, project: ProjectEntry) -> bool:
+        """Update *project* with the on-disk hash so the manifest is pinned.
+
+        For archives without a hash field this is a no-op (the URL is the
+        version identifier and does not change).
+
+        Returns:
+            *True* when the manifest entry was modified, *False* otherwise.
+        """
+        on_disk = self.on_disk_version()
+        if not on_disk:
+            return False
+
+        revision = on_disk.revision
+        if not revision.startswith(tuple(f"{a}:" for a in SUPPORTED_HASH_ALGORITHMS)):
+            # Archive without a hash – nothing to freeze beyond the URL
+            return False
+
+        if project.hash == revision:
+            return False
+
+        project.hash = revision
+        return True

diff --git a/dfetch/project/subproject.py b/dfetch/project/subproject.py
index 20f685e8..6d5fc5a7 100644
--- a/dfetch/project/subproject.py
+++ b/dfetch/project/subproject.py
@@ -388,6 +388,26 @@ def _fetch_impl(self, version: Version) -> Version:
     def get_default_branch(self) -> str:
         """Get the default branch of this repository."""
 
+    def freeze_project(self, project: ProjectEntry) -> bool:
+        """Freeze *project* to its current on-disk version.
+
+        Subclasses may override this to apply VCS-specific freeze logic (e.g.
+        :class:`~dfetch.project.archivesubproject.ArchiveSubProject` stores
+        the hash in the ``hash:`` field rather than ``revision:``).
+
+        Returns:
+            *True* when the manifest entry was modified, *False* if the entry
+            was already pinned to the on-disk version or no on-disk version
+            could be determined.
+        """
+        on_disk_version = self.on_disk_version()
+        if project.version == on_disk_version:
+            return False
+        if on_disk_version:
+            project.version = on_disk_version
+            return True
+        return False
+
     @staticmethod
     def is_license_file(filename: str) -> bool:
         """Check if the given filename is a license file."""

diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py
new file mode 100644
index 00000000..4d2049f6
--- /dev/null
+++ b/dfetch/vcs/archive.py
@@ -0,0 +1,239 @@
+"""Archive (tar/zip) VCS implementation.
+
+Supports fetching dependencies distributed as ``.tar.gz``, ``.tgz``,
+``.tar.bz2``, ``.tar.xz`` or ``.zip`` archives from any URL that Python's
+:mod:`urllib.request` can reach (``http://``, ``https://``, ``file://``, …).
+
+Optional integrity checking is supported via a ``hash:`` manifest field
+(e.g. ``hash: sha256:<hex-digest>``). The ``sha256`` algorithm is supported
+today; the format is designed for extension to ``sha512``, ``md5``, etc.
+
+Example manifest entry::
+
+    projects:
+      - name: my-library
+        url: https://example.com/releases/my-library-1.0.tar.gz
+        vcs: archive
+        hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+
+"""
+
+from __future__ import annotations
+
+import hashlib
+import io
+import os
+import pathlib
+import shutil
+import tarfile
+import tempfile
+import urllib.error
+import urllib.request
+import zipfile
+from collections.abc import Sequence
+
+from dfetch.log import get_logger
+from dfetch.project.subproject import SubProject
+from dfetch.util.util import find_matching_files, safe_rm
+
+logger = get_logger(__name__)
+
+#: Archive file extensions recognised by DFetch.
+ARCHIVE_EXTENSIONS = (".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".zip")
+
+#: Hash algorithms supported by the ``hash:`` manifest field.
+SUPPORTED_HASH_ALGORITHMS = ("sha256",)
+
+
+def is_archive_url(url: str) -> bool:
+    """Return *True* when *url* ends with a recognised archive extension."""
+    return any(url.lower().endswith(ext) for ext in ARCHIVE_EXTENSIONS)
+
+
+def compute_hash(path: str, algorithm: str = "sha256") -> str:
+    """Compute the hex digest of *path* using *algorithm*.
+
+    Args:
+        path: Path to the file.
+        algorithm: Hash algorithm name (e.g. ``"sha256"``).
+
+    Returns:
+        Lowercase hex digest string.
+
+    Raises:
+        RuntimeError: When *algorithm* is not supported.
+    """
+    if algorithm not in SUPPORTED_HASH_ALGORITHMS:
+        raise RuntimeError(
+            f"Unsupported hash algorithm '{algorithm}'. "
+            f"Supported: {', '.join(SUPPORTED_HASH_ALGORITHMS)}"
+        )
+    h = hashlib.new(algorithm)
+    with open(path, "rb") as fh:
+        for chunk in iter(lambda: fh.read(65536), b""):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+class ArchiveRemote:
+    """Represents a remote archive (tar/zip) URL.
+
+    Provides helpers to check accessibility and download the archive.
+    """
+
+    def __init__(self, url: str) -> None:
+        """Create an ArchiveRemote for *url*."""
+        self.url = url
+
+    def is_accessible(self) -> bool:
+        """Return *True* when the archive URL is reachable.
+
+        Sends a lightweight ``HEAD`` request for ``http``/``https`` URLs and
+        tests existence for ``file://`` URLs. Returns *False* on any network
+        or I/O error.
+        """
+        try:
+            parsed = urllib.request.Request(self.url, method="HEAD")
+            with urllib.request.urlopen(parsed, timeout=15):
+                return True
+        except (urllib.error.URLError, OSError, ValueError):
+            return False
+
+    def download(self, dest_path: str) -> None:
+        """Download the archive to *dest_path*.
+
+        Args:
+            dest_path: Local file path to write the archive to.
+
+        Raises:
+            RuntimeError: On download failure.
+        """
+        try:
+            urllib.request.urlretrieve(self.url, dest_path)
+        except (urllib.error.URLError, OSError) as exc:
+            raise RuntimeError(
+                f"'{self.url}' is not a valid URL or unreachable: {exc}"
+            ) from exc
+
+
+class ArchiveLocalRepo:
+    """Extracts an archive to a local destination directory.
+
+    Supports ``.tar.gz``, ``.tgz``, ``.tar.bz2``, ``.tar.xz`` and ``.zip``
+    archives. A single top-level directory in the archive is automatically
+    stripped (like ``tar --strip-components=1``), so the archive may be
+    structured as ``project-1.0/src/…`` or ``src/…`` – both work.
+    """
+
+    @staticmethod
+    def extract(
+        archive_path: str,
+        dest_dir: str,
+        src: str = "",
+        ignore: Sequence[str] = (),
+        is_license: bool = True,
+    ) -> None:
+        """Extract *archive_path* into *dest_dir*, applying *src* / *ignore* filters.
+
+        Args:
+            archive_path: Path to the downloaded archive file.
+            dest_dir: Directory to place the extracted contents into.
+            src: Optional sub-directory (or glob pattern) inside the archive
+                to extract exclusively. License files from the archive root
+                are always retained when *src* is set.
+            ignore: Sequence of glob patterns for files/directories to exclude.
+            is_license: Whether to check for and retain license files when
+                *src* is specified.
+        """
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            ArchiveLocalRepo._extract_raw(archive_path, tmp_dir)
+
+            # Strip a single top-level directory if the archive uses one
+            entries = os.listdir(tmp_dir)
+            if len(entries) == 1 and os.path.isdir(
+                os.path.join(tmp_dir, entries[0])
+            ):
+                extract_root = os.path.join(tmp_dir, entries[0])
+            else:
+                extract_root = tmp_dir
+
+            pathlib.Path(dest_dir).mkdir(parents=True, exist_ok=True)
+
+            if src:
+                ArchiveLocalRepo._copy_with_src(
+                    extract_root, dest_dir, src.rstrip("/"), is_license
+                )
+            else:
+                ArchiveLocalRepo._copy_all(extract_root, dest_dir)
+
+            if ignore:
+                ArchiveLocalRepo._apply_ignore(dest_dir, ignore)
+
+    @staticmethod
+    def _extract_raw(archive_path: str, dest_dir: str) -> None:
+        """Extract archive contents to *dest_dir* without any filtering."""
+        lower = archive_path.lower()
+        if tarfile.is_tarfile(archive_path) and not lower.endswith(".zip"):
+            with tarfile.open(archive_path, "r:*") as tf:
+                tf.extractall(dest_dir, filter="tar")
+        elif lower.endswith(".zip") or zipfile.is_zipfile(archive_path):
+            with zipfile.ZipFile(archive_path) as zf:
+                zf.extractall(dest_dir)
+        else:
+            raise RuntimeError(
+                f"Unsupported archive format: '{archive_path}'. "
+                f"Supported formats: {', '.join(ARCHIVE_EXTENSIONS)}"
+            )
+
+    @staticmethod
+    def _copy_with_src(
+        extract_root: str, dest_dir: str, src: str, keep_licenses: bool
+    ) -> None:
+        """Copy only *src* sub-directory contents (and optionally licenses) to *dest_dir*."""
+        src_path = os.path.join(extract_root, src)
+
+        if os.path.isdir(src_path):
+            for item in os.listdir(src_path):
+                s = os.path.join(src_path, item)
+                d = os.path.join(dest_dir, item)
+                if os.path.isdir(s):
+                    shutil.copytree(s, d)
+                else:
+                    shutil.copy2(s, d)
+        elif os.path.isfile(src_path):
+            shutil.copy2(src_path, os.path.join(dest_dir, os.path.basename(src_path)))
+
+        if keep_licenses:
+            for item in os.listdir(extract_root):
+                full = os.path.join(extract_root, item)
+                if os.path.isfile(full) and SubProject.is_license_file(item):
+                    shutil.copy2(full, os.path.join(dest_dir, item))
+
+    @staticmethod
+    def _copy_all(extract_root: str, dest_dir: str) -> None:
+        """Copy all contents of *extract_root* into *dest_dir*."""
+        for item in os.listdir(extract_root):
+            s = os.path.join(extract_root, item)
+            d = os.path.join(dest_dir, item)
+            if os.path.isdir(s):
+                shutil.copytree(s, d)
+            else:
+                shutil.copy2(s, d)
+
+    @staticmethod
+    def _apply_ignore(dest_dir: str, ignore: Sequence[str]) -> None:
+        """Remove files/directories matching *ignore* patterns from *dest_dir*."""
+        for file_or_dir in find_matching_files(dest_dir, ignore):
+            if not (
+                file_or_dir.is_file() and SubProject.is_license_file(file_or_dir.name)
+            ):
+                safe_rm(file_or_dir)
+
+
+def _suffix_for_url(url: str) -> str:
+    """Return the archive file suffix for a URL (e.g. '.tar.gz', '.zip')."""
+    lower = url.lower()
+    for ext in sorted(ARCHIVE_EXTENSIONS, key=len, reverse=True):
+        if lower.endswith(ext):
+            return ext
+    return ".archive"

diff --git a/doc/manifest.rst b/doc/manifest.rst
index 65e30aef..da440c88 100644
--- a/doc/manifest.rst
+++ b/doc/manifest.rst
@@ -99,6 +99,7 @@ Below an overview of all possible fields on the manifest. The bold items are man
             enum:
               - git
               - svn
+              - archive
         src:
             type: string
            description: >
@@ -108,4 +109,12 @@ Below an overview of all possible fields on the manifest. The bold items are man
             description: Files to ignore. See :ref:`Ignore` for details.
             items:
                 type: string
+        hash:
+            type: string
+            description: >
+                Cryptographic hash of the archive file for integrity verification.
+                Only used with ``vcs: archive``. Format: ``<algorithm>:<hex-digest>``.
+                Currently ``sha256`` is supported (e.g. ``sha256:e3b0c4…``).
+                The format is designed for future extension to ``sha512``, etc.
+                See :ref:`Archive` for details.
         uniqueItems: true

diff --git a/features/check-archive.feature b/features/check-archive.feature
index bba9b3dc..61b612f0 100644
--- a/features/check-archive.feature
+++ b/features/check-archive.feature
@@ -1,9 +1,11 @@
 Feature: Checking dependencies from an archive
 
   DFetch can check if archive-based projects are up-to-date.
-  For archives without a hash, the URL is used as the version identifier.
-  For archives with a 'hash:' field, the hash is verified against the
-  downloaded archive to determine if the content matches.
+  For archives without a hash the URL is the version identifier so the
+  project is always considered up-to-date once fetched (the URL has not
+  changed). For archives with a 'hash:' field the hash is the version
+  identifier, and dfetch reports whether the locally stored version
+  matches the wanted hash.
 
   Scenario: Archive project without hash is reported as up-to-date after fetch
     Given an archive "SomeProject.tar.gz" with the files
@@ -68,7 +70,7 @@ Feature: Checking dependencies from an archive
       """
       Dfetch (0.12.1)
         SomeProject:
-          > wanted (some-remote-server/SomeProject.tar.gz), but never fetched!
+          > wanted (some-remote-server/SomeProject.tar.gz), available (some-remote-server/SomeProject.tar.gz)
       """
 
   Scenario: Non-existent archive URL is reported
@@ -78,7 +80,7 @@ Feature: Checking dependencies from an archive
       version: '0.0'
       projects:
         - name: non-existent-archive
-          url: https://example.com/does-not-exist.tar.gz
+          url: https://dfetch.invalid/does-not-exist.tar.gz
           vcs: archive
       """
     When I run "dfetch check"
@@ -86,7 +88,7 @@ Feature: Checking dependencies from an archive
       """
       Dfetch (0.12.1)
         non-existent-archive:
-          > 'https://example.com/does-not-exist.tar.gz' is not a valid URL or unreachable
+          > wanted (https://dfetch.invalid/does-not-exist.tar.gz), but not available at the upstream.
       """
 
   Scenario: Archive with local changes is reported

diff --git a/features/fetch-archive.feature b/features/fetch-archive.feature
index 2f5bdd2c..c5954d78 100644
--- a/features/fetch-archive.feature
+++ b/features/fetch-archive.feature
@@ -1,9 +1,9 @@
 Feature: Fetching dependencies from an archive (tar/zip)
 
-  Some projects are distributed as tar or zip archives, for example as GitHub
-  release assets or on internal artifact servers. DFetch supports fetching
-  these archives using the 'archive' vcs type. Optionally, a hash can be
-  specified for integrity verification using 'hash: <algorithm>:<hash>'.
+  Some projects are distributed as tar or zip archives, for example as
+  GitHub release assets or on internal artifact servers. DFetch supports
+  fetching these archives using the 'archive' vcs type. Optionally, a hash
+  can be specified with 'hash: <algorithm>:<hash>' for integrity verification.
 
   Scenario: Tar.gz archive project is fetched
     Given an archive "SomeProject.tar.gz" with the files
@@ -33,8 +33,8 @@ Feature: Fetching dependencies from an archive (tar/zip)
 
   Scenario: Zip archive project is fetched
     Given an archive "SomeProject.zip" with the files
-      | path      |
-      | README.md |
+      | path          |
+      | README.md     |
       | include/lib.h |
     And the manifest 'dfetch.yaml' in MyProject
       """
@@ -94,14 +94,12 @@ Feature: Fetching dependencies from an archive (tar/zip)
     Then the output shows
       """
       Dfetch (0.12.1)
-        SomeProject:
-          > Hash mismatch for SomeProject! sha256 expected 0000000000000000000000000000000000000000000000000000000000000000
+      Hash mismatch for SomeProject! sha256 expected 0000000000000000000000000000000000000000000000000000000000000000
       """
 
   Scenario: Specific directory from archive can be fetched
     Given an archive "SomeProject.tar.gz" with the files
       | path              |
-      | README.md         |
       | src/main.c        |
       | src/lib.c         |
       | tests/test_main.c |
@@ -121,7 +119,6 @@ Feature: Fetching dependencies from an archive (tar/zip)
       MyProject/
           SomeProject/
               .dfetch_data.yaml
-              README.md
               lib.c
               main.c
           dfetch.yaml

diff --git a/features/freeze-archive.feature b/features/freeze-archive.feature
index baa9584c..90bfdaf4 100644
--- a/features/freeze-archive.feature
+++ b/features/freeze-archive.feature
@@ -4,6 +4,8 @@ Feature: Freeze archive dependencies
   to pin the exact archive content. This uses the 'hash: sha256:<hash>'
   format, which can be extended to other algorithms in the future.
 
+  Archives that already have a hash in the manifest are left unchanged.
+
   Scenario: Archive project is frozen with its sha256 hash
     Given an archive "SomeProject.tar.gz" with the files
       | path      |

From fe49a30c36417cae593d707c5b24e9609d255d95 Mon Sep 17 00:00:00 2001
From: Claude
Date: Fri, 20 Mar 2026 08:28:57 +0000
Subject: [PATCH 03/35] Add archive support to reporters and SBOM (#23)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Archive dependencies (tar.gz, zip, …) are now fully represented in all
reporters:

dfetch/util/purl.py
- Detect archive URLs by extension (.tar.gz, .tgz, .tar.bz2, .tar.xz,
  .zip)
- Generate a generic PURL with a download_url qualifier (semantically
  correct: it's a download artifact, not a VCS path)
- Derive component name from archive filename (extension stripped) and
  namespace from hostname (empty for file:// URLs used in tests)
- Expose DFETCH_TO_CDX_HASH_ALGORITHM mapping for use in the SBOM
  reporter

dfetch/reporting/sbom_reporter.py
- Import HashAlgorithm and HashType from cyclonedx.model
- When a PURL has a download_url qualifier (archive), add a
  DISTRIBUTION external reference instead of a VCS reference
- When the version encodes a sha256 hash (sha256:<hex>), add a
  CycloneDX HashType with alg=SHA-256 to the component for supply-chain
  verification
- Add Archive dependencies section and scenario-include to module
  docstring

dfetch/commands/report.py
- _determine_version now falls back to project.hash when no metadata is
  present yet, so unfetched hash-pinned archives report correctly in
  SBOM

features/steps/archive_steps.py
- Store context.archive_url (file:// URI) alongside
  context.archive_sha256 so feature tests can assert dynamic URLs
  without hardcoding paths

features/steps/manifest_steps.py
- Substitute <archive_url> placeholder in apply_manifest_substitutions()

features/steps/generic_steps.py
- Add _apply_context_substitutions() for <archive_sha256> /
  <archive_url>
- Add _json_subset_matches() for recursive subset matching
- Add check_json_subset() helper
- Add @then("the '{name}' json file includes") step for partial JSON
  assertions

features/report-sbom-archive.feature (new)
- 3 scenarios: archive without hash, archive with sha256 hash
  (distribution ref + component hash), unfetched archive with hash in
  manifest

https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm
---
 dfetch/commands/report.py            |  10 ++-
 dfetch/reporting/sbom_reporter.py    |  34 +++++++-
 dfetch/util/purl.py                  |  46 ++++++++++-
 features/report-sbom-archive.feature | 116 +++++++++++++++++++++++++++
 features/steps/archive_steps.py      |   2 +
 features/steps/generic_steps.py      |  53 ++++++++++++
 features/steps/manifest_steps.py     |   2 +
 7 files changed, 258 insertions(+), 5 deletions(-)
 create mode 100644 features/report-sbom-archive.feature

diff --git a/dfetch/commands/report.py b/dfetch/commands/report.py
index d54ee31c..b76e6b02 100644
--- a/dfetch/commands/report.py
+++ b/dfetch/commands/report.py
@@ -107,10 +107,16 @@ def _determine_licenses(project: ProjectEntry) -> list[License]:
 
     @staticmethod
     def _determine_version(project: ProjectEntry) -> str:
-        """Determine the fetched version."""
+        """Determine the fetched version.
+
+        For archive projects the sha256 hash (``sha256:<hex>``) stored in the
+        metadata *revision* field is used as the version identifier. When no
+        metadata is present yet, the ``hash:`` field from the manifest is used
+        as fallback so the SBOM can still be generated before the first fetch.
+        """
         try:
             metadata = Metadata.from_file(Metadata.from_project_entry(project).path)
             version = metadata.tag or metadata.revision or ""
         except FileNotFoundError:
-            version = project.tag or project.revision or ""
+            version = project.tag or project.revision or project.hash or ""
         return version

diff --git a/dfetch/reporting/sbom_reporter.py b/dfetch/reporting/sbom_reporter.py
index f5b352fc..6052fb91 100644
--- a/dfetch/reporting/sbom_reporter.py
+++ b/dfetch/reporting/sbom_reporter.py
@@ -14,6 +14,16 @@
 
     :scenario: A fetched project generates a json sbom
 
+Archive dependencies
+--------------------
+Archive dependencies (tar.gz, zip, …) are recorded with a ``distribution``
+external reference and, when a ``hash:`` field is set, a ``SHA-256`` component
+hash for supply-chain integrity verification.
+
+.. scenario-include:: ../features/report-sbom-archive.feature
+    :scenario:
+        A fetched archive with sha256 hash generates a json sbom with hash
+
 Gitlab
 ------
 Let *DFetch* generate a SBoM and add the result as artifact in your gitlab-ci runner.
@@ -71,7 +81,7 @@ from decimal import Decimal from cyclonedx.builder.this import this_component as cdx_lib_component -from cyclonedx.model import ExternalReference, ExternalReferenceType, XsUri +from cyclonedx.model import ExternalReference, ExternalReferenceType, HashAlgorithm, HashType, XsUri from cyclonedx.model.bom import Bom from cyclonedx.model.component import Component, ComponentType from cyclonedx.model.component_evidence import ( @@ -89,6 +99,7 @@ from cyclonedx.schema import OutputFormat, SchemaVersion import dfetch.util.purl +from dfetch.util.purl import DFETCH_TO_CDX_HASH_ALGORITHM from dfetch.manifest.manifest import Manifest from dfetch.manifest.project import ProjectEntry from dfetch.reporting.reporter import Reporter @@ -244,6 +255,27 @@ def add_project( url=XsUri(f"https://bitbucket.org/{purl.namespace}/{purl.name}"), ) ) + elif purl.qualifiers.get("download_url"): + # Archive dependency: add a DISTRIBUTION external reference and, + # when the version encodes a cryptographic hash, record it on the component. + download_url = purl.qualifiers["download_url"] + component.group = purl.namespace or None # type: ignore[assignment] + component.external_references.add( + ExternalReference( + type=ExternalReferenceType.DISTRIBUTION, + url=XsUri(download_url), + ) + ) + if version and ":" in version: + algo_prefix, hex_value = version.split(":", 1) + cdx_algo_name = DFETCH_TO_CDX_HASH_ALGORITHM.get(algo_prefix) + if cdx_algo_name: + component.hashes.add( + HashType( + alg=HashAlgorithm(cdx_algo_name), + content=hex_value, + ) + ) else: component.group = purl.namespace diff --git a/dfetch/util/purl.py b/dfetch/util/purl.py index f7a7a819..81367c4d 100644 --- a/dfetch/util/purl.py +++ b/dfetch/util/purl.py @@ -1,8 +1,9 @@ """Module to convert remote URLs to valid Package URLs (PURLs). -Supports: GitHub, Bitbucket, SVN, SSH paths, and more. +Supports: GitHub, Bitbucket, SVN, SSH paths, archives, and more. """ +import os.path import re from urllib.parse import urlparse @@ -35,10 +36,34 @@ # These domains have no specific Purl type, but adding the domain to the purl doesn't add any value EXCLUDED_DOMAINS = ["gitlab", "gitea", "gitee", "sf", "gnu"] +# Archive file extensions recognised as downloadable archive artifacts +_ARCHIVE_EXTENSIONS = (".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".zip") + +# Map from dfetch hash-field algorithm prefix to CycloneDX HashAlgorithm name +DFETCH_TO_CDX_HASH_ALGORITHM: dict[str, str] = { + "sha256": "SHA-256", +} + # Name given to a package or group if it is not extractable from the URL DEFAULT_NAME = "unknown" +def _is_archive_url(url: str) -> bool: + """Return *True* when *url* points to a recognised archive file.""" + lower = url.lower().split("?")[0] # strip query string before checking extension + return any(lower.endswith(ext) for ext in _ARCHIVE_EXTENSIONS) + + +def _strip_archive_extension(name: str) -> str: + """Remove a recognised archive extension from *name*.""" + lower = name.lower() + # Check multi-part extensions first (.tar.gz etc.) + for ext in _ARCHIVE_EXTENSIONS: + if lower.endswith(ext): + return name[: -len(ext)] + return name + + def _namespace_and_name_from_domain_and_path(domain: str, path: str) -> tuple[str, str]: """Split the full path to a name and namespace.""" domain = NO_FETCH_EXTRACT(domain).domain @@ -82,13 +107,30 @@ def remote_url_to_purl( ) -> PackageURL: """Convert a remote URL to a valid PackageURL object. - Supports GitHub, Bitbucket, SVN, SSH paths. + Supports GitHub, Bitbucket, SVN, SSH paths, and archive downloads. 
Optionally specify version and subpath.
     """
     purl = _known_purl_types(remote_url, version, subpath)
     if purl:
         return purl
 
+    # Archive URLs (tar.gz, zip, …) get a generic PURL with a download_url qualifier.
+    # The name is derived from the archive filename (extension stripped); the
+    # namespace is the hostname (empty for file:// URLs).
+    if _is_archive_url(remote_url):
+        parsed = urlparse(remote_url)
+        basename = os.path.basename(parsed.path)
+        name = _strip_archive_extension(basename) or DEFAULT_NAME
+        namespace = parsed.hostname or ""
+        return PackageURL(
+            type="generic",
+            namespace=namespace or None,
+            name=name,
+            version=version,
+            qualifiers={"download_url": remote_url},
+            subpath=subpath,
+        )
+
     parsed = urlparse(remote_url)
     path = parsed.path.lstrip("/")
diff --git a/features/report-sbom-archive.feature b/features/report-sbom-archive.feature
new file mode 100644
index 00000000..705be1f5
--- /dev/null
+++ b/features/report-sbom-archive.feature
@@ -0,0 +1,116 @@
+Feature: Create a CycloneDX SBOM for archive dependencies
+
+  *Dfetch* can generate a software Bill-of-Materials (SBOM) that includes
+  dependencies fetched from tar/zip archives.
+
+  For archive components the SBOM records:
+  - A ``generic`` Package URL (PURL) with a ``download_url`` qualifier
+    pointing at the archive.
+  - An external reference of type ``distribution`` (not ``vcs``).
+  - A ``SHA-256`` component hash when a ``hash:`` field is present in the
+    manifest, so downstream tooling can verify supply-chain integrity.
+
+  Scenario: A fetched archive without a hash generates a json sbom
+    Given an archive "SomeProject.tar.gz"
+    And the manifest 'dfetch.yaml'
+      """
+      manifest:
+        version: '0.0'
+
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+      """
+    And all projects are updated
+    When I run "dfetch report -t sbom"
+    Then the 'report.json' json file includes
+      """
+      {
+        "components": [
+          {
+            "name": "SomeProject",
+            "type": "library",
+            "externalReferences": [
+              {
+                "type": "distribution",
+                "url": "<archive_url>"
+              }
+            ]
+          }
+        ]
+      }
+      """
+
+  Scenario: A fetched archive with sha256 hash generates a json sbom with hash
+    Given an archive "SomeProject.tar.gz"
+    And the manifest 'dfetch.yaml'
+      """
+      manifest:
+        version: '0.0'
+
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+            hash: sha256:<archive_sha256>
+      """
+    And all projects are updated
+    When I run "dfetch report -t sbom"
+    Then the 'report.json' json file includes
+      """
+      {
+        "components": [
+          {
+            "name": "SomeProject",
+            "version": "sha256:<archive_sha256>",
+            "type": "library",
+            "hashes": [
+              {
+                "alg": "SHA-256",
+                "content": "<archive_sha256>"
+              }
+            ],
+            "externalReferences": [
+              {
+                "type": "distribution",
+                "url": "<archive_url>"
+              }
+            ]
+          }
+        ]
+      }
+      """
+
+  Scenario: An unfetched archive with hash in manifest reports hash as version
+    Given an archive "SomeProject.tar.gz"
+    And the manifest 'dfetch.yaml'
+      """
+      manifest:
+        version: '0.0'
+
+        projects:
+          - name: SomeProject
+            url: some-remote-server/SomeProject.tar.gz
+            vcs: archive
+            hash: sha256:<archive_sha256>
+      """
+    When I run "dfetch report -t sbom"
+    Then the 'report.json' json file includes
+      """
+      {
+        "components": [
+          {
+            "name": "SomeProject",
+            "version": "sha256:<archive_sha256>",
+            "type": "library",
+            "hashes": [
+              {
+                "alg": "SHA-256",
+                "content": "<archive_sha256>"
+              }
+            ]
+          }
+        ]
+      }
+      """
diff --git a/features/steps/archive_steps.py b/features/steps/archive_steps.py
index aa5ee229..990380e5 100644
--- a/features/steps/archive_steps.py
+++ b/features/steps/archive_steps.py
@@ -55,6 +55,7 @@ def step_impl(context,
name):
     create_tar_gz(archive_path, name, files)
     context.archive_sha256 = compute_sha256(archive_path)
+    context.archive_url = pathlib.Path(archive_path).as_uri()
 
 
 @given('an archive "{name}.zip" with the files')
@@ -68,3 +69,4 @@ def step_impl(context, name):
     create_zip(archive_path, name, files)
     context.archive_sha256 = compute_sha256(archive_path)
+    context.archive_url = pathlib.Path(archive_path).as_uri()
diff --git a/features/steps/generic_steps.py b/features/steps/generic_steps.py
index 66494a02..0651d5ce 100644
--- a/features/steps/generic_steps.py
+++ b/features/steps/generic_steps.py
@@ -95,6 +95,53 @@ def check_json(path: Union[str, os.PathLike], content: str) -> None:
     )
 
 
+def _apply_context_substitutions(text: str, context) -> str:
+    """Replace dynamic placeholders with values stored on *context*."""
+    if hasattr(context, "archive_sha256"):
+        text = text.replace("<archive_sha256>", context.archive_sha256)
+    if hasattr(context, "archive_url"):
+        text = text.replace("<archive_url>", context.archive_url)
+    return text
+
+
+def _json_subset_matches(expected, actual) -> bool:
+    """Return *True* when *expected* is a subset of *actual* (recursive)."""
+    if isinstance(expected, dict):
+        if not isinstance(actual, dict):
+            return False
+        return all(
+            k in actual and _json_subset_matches(v, actual[k])
+            for k, v in expected.items()
+        )
+    if isinstance(expected, list):
+        if not isinstance(actual, list):
+            return False
+        return all(
+            any(_json_subset_matches(exp_item, act_item) for act_item in actual)
+            for exp_item in expected
+        )
+    return expected == actual
+
+
+def check_json_subset(path: Union[str, os.PathLike], content: str, context) -> None:
+    """Assert that a JSON file *contains* the given key-values (subset match).
+
+    Dynamic placeholders (``<archive_sha256>``, ``<archive_url>``) in
+    *content* are substituted with values from *context* before parsing.
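+
+    For example (illustrative documents): an expected ``{"a": 1}`` matches
+    an actual ``{"a": 1, "b": 2}``, and an expected list item matches when
+    *any* element of the actual list matches it recursively.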
+ """ + content = _apply_context_substitutions(content, context) + + with open(path, "r", encoding="UTF-8") as file_to_check: + actual_json = json.load(file_to_check) + expected_json = json.loads(content) + + assert _json_subset_matches(expected_json, actual_json), ( + f"JSON subset mismatch.\n" + f"Expected subset:\n{json.dumps(expected_json, indent=4, sort_keys=True)}\n" + f"Actual:\n{json.dumps(actual_json, indent=4, sort_keys=True)}" + ) + + def check_content( expected_content: Iterable[str], actual_content: Iterable[str] ) -> None: @@ -334,6 +381,12 @@ def step_impl(context, name): check_file(name, context.text) +@then("the '{name}' json file includes") +def step_impl(context, name): + """Partial JSON match – the expected JSON must be a *subset* of the actual file.""" + check_json_subset(name, context.text, context) + + def multisub(patterns: List[Tuple[Pattern[str], str]], text: str) -> str: """Apply a list of tuples that each contain a regex + replace string.""" for pattern, replace in patterns: diff --git a/features/steps/manifest_steps.py b/features/steps/manifest_steps.py index ce2873c9..66d834c1 100644 --- a/features/steps/manifest_steps.py +++ b/features/steps/manifest_steps.py @@ -19,6 +19,8 @@ def apply_manifest_substitutions(context, contents: str) -> str: ) if hasattr(context, "archive_sha256"): result = result.replace("", context.archive_sha256) + if hasattr(context, "archive_url"): + result = result.replace("", context.archive_url) return result From 93a5632fe8319b619f94c43a7812a5de71d4a573 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 20 Mar 2026 09:11:38 +0000 Subject: [PATCH 04/35] Security hardening for archive extraction (#23) Four vulnerabilities identified and fixed: 1. ZIP path traversal (CRITICAL) zipfile.extractall() does not sanitise member paths on Python < 3.12. A crafted zip could write files outside the destination directory via members like '../../etc/passwd'. Fix: _check_zip_members() validates every member path before extraction, rejecting absolute paths and any '..' component. 2. Decompression bomb / zip bomb (HIGH) No limits were placed on the total uncompressed size or member count. A tiny archive that decompresses to gigabytes would silently fill disk. Fix: _check_tar_members() and _check_zip_members() enforce _MAX_UNCOMPRESSED_BYTES (500 MB) and _MAX_MEMBER_COUNT (10 000) limits before any bytes are written to disk. 3. Temp file leak in _does_revision_exist (MEDIUM) NamedTemporaryFile(delete=False) was used without a finally block. An exception during download or hash computation left the temp file on disk. Fix: wrap in try/finally; always os.remove() the temp file. 4. Non-constant-time hash comparison (LOW) 'actual_hex != expected_hex' is a short-circuit string compare, leaking prefix information via timing. Fix: _safe_compare_hex() wraps hmac.compare_digest() for constant-time comparison; used in both _fetch_impl and _does_revision_exist. Also: drop the late 'import os' inside _fetch_impl's finally block now that os is imported at module level. 
https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm --- dfetch/project/archivesubproject.py | 22 +++++--- dfetch/vcs/archive.py | 78 ++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 8 deletions(-) diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index fb32d9b0..4715c9f1 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -36,6 +36,7 @@ from __future__ import annotations +import os import pathlib import tempfile @@ -47,9 +48,10 @@ SUPPORTED_HASH_ALGORITHMS, ArchiveLocalRepo, ArchiveRemote, + _safe_compare_hex, + _suffix_for_url, compute_hash, is_archive_url, - _suffix_for_url, ) logger = get_logger(__name__) @@ -110,15 +112,23 @@ def _does_revision_exist(self, revision: str) -> bool: for algo in SUPPORTED_HASH_ALGORITHMS: if revision.startswith(f"{algo}:"): expected_hex = revision.split(":", 1)[1] + tmp_path: str | None = None try: with tempfile.NamedTemporaryFile( suffix=_suffix_for_url(self.remote), delete=False ) as tmp: - self._remote_repo.download(tmp.name) - actual = compute_hash(tmp.name, algo) - return actual == expected_hex + tmp_path = tmp.name + self._remote_repo.download(tmp_path) + actual = compute_hash(tmp_path, algo) + return _safe_compare_hex(actual, expected_hex) except RuntimeError: return False + finally: + if tmp_path: + try: + os.remove(tmp_path) + except OSError: + pass # revision is the URL – just check accessibility return self._remote_repo.is_accessible() @@ -177,7 +187,7 @@ def _fetch_impl(self, version: Version) -> Version: if expected_hash: algorithm, expected_hex = expected_hash.split(":", 1) actual_hex = compute_hash(tmp_path, algorithm) - if actual_hex != expected_hex: + if not _safe_compare_hex(actual_hex, expected_hex): raise RuntimeError( f"Hash mismatch for {self._project_entry.name}! " f"{algorithm} expected {expected_hex}" @@ -191,8 +201,6 @@ def _fetch_impl(self, version: Version) -> Version: ) finally: try: - import os - os.remove(tmp_path) except OSError: pass diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index 4d2049f6..8e96eed4 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -21,6 +21,7 @@ from __future__ import annotations import hashlib +import hmac import io import os import pathlib @@ -44,6 +45,10 @@ #: Hash algorithms supported by the ``hash:`` manifest field. SUPPORTED_HASH_ALGORITHMS = ("sha256",) +# Safety limits applied during extraction to prevent decompression bombs. +_MAX_UNCOMPRESSED_BYTES = 500 * 1024 * 1024 # 500 MB +_MAX_MEMBER_COUNT = 10_000 + def is_archive_url(url: str) -> bool: """Return *True* when *url* ends with a recognised archive extension.""" @@ -75,6 +80,15 @@ def compute_hash(path: str, algorithm: str = "sha256") -> str: return h.hexdigest() +def _safe_compare_hex(actual: str, expected: str) -> bool: + """Constant-time comparison of two hex digest strings. + + Uses :func:`hmac.compare_digest` to avoid leaking information about the + expected hash value via timing side-channels. + """ + return hmac.compare_digest(actual.lower(), expected.lower()) + + class ArchiveRemote: """Represents a remote archive (tar/zip) URL. @@ -169,15 +183,77 @@ def extract( if ignore: ArchiveLocalRepo._apply_ignore(dest_dir, ignore) + @staticmethod + def _check_zip_members(zf: zipfile.ZipFile) -> None: + """Validate all ZIP member paths against path-traversal attacks. + + Raises: + RuntimeError: When any member contains an absolute path, a ``..`` + component, or when the archive exceeds the size/count limits. 
+ """ + members = zf.infolist() + if len(members) > _MAX_MEMBER_COUNT: + raise RuntimeError( + f"Archive contains {len(members)} members which exceeds the " + f"safety limit of {_MAX_MEMBER_COUNT}." + ) + total_bytes = sum(info.file_size for info in members) + if total_bytes > _MAX_UNCOMPRESSED_BYTES: + raise RuntimeError( + f"Archive uncompressed size ({total_bytes} bytes) exceeds the " + f"safety limit of {_MAX_UNCOMPRESSED_BYTES} bytes." + ) + for info in members: + member_path = pathlib.PurePosixPath(info.filename) + if member_path.is_absolute() or any( + part == ".." for part in member_path.parts + ): + raise RuntimeError( + f"Archive contains an unsafe member path: {info.filename!r}" + ) + + @staticmethod + def _check_tar_members(tf: tarfile.TarFile) -> None: + """Validate TAR member count and total size against decompression bombs. + + Raises: + RuntimeError: When the archive exceeds the size/count limits. + """ + members = tf.getmembers() + if len(members) > _MAX_MEMBER_COUNT: + raise RuntimeError( + f"Archive contains {len(members)} members which exceeds the " + f"safety limit of {_MAX_MEMBER_COUNT}." + ) + total_bytes = sum(m.size for m in members if m.isfile()) + if total_bytes > _MAX_UNCOMPRESSED_BYTES: + raise RuntimeError( + f"Archive uncompressed size ({total_bytes} bytes) exceeds the " + f"safety limit of {_MAX_UNCOMPRESSED_BYTES} bytes." + ) + @staticmethod def _extract_raw(archive_path: str, dest_dir: str) -> None: - """Extract archive contents to *dest_dir* without any filtering.""" + """Extract archive contents to *dest_dir* without any filtering. + + Safety checks performed before extraction: + + * TAR: member count and total uncompressed size (decompression bomb). + Path sanitisation is handled by the built-in ``filter="tar"`` filter + (available from Python 3.11.4 / 3.12 as a security backport) which + rejects absolute paths, ``..`` components, absolute symlinks, and + device files. + * ZIP: member path traversal validation (absolute paths and ``..`` + components are rejected) plus member count and size limits. 
+ """ lower = archive_path.lower() if tarfile.is_tarfile(archive_path) and not lower.endswith(".zip"): with tarfile.open(archive_path, "r:*") as tf: + ArchiveLocalRepo._check_tar_members(tf) tf.extractall(dest_dir, filter="tar") elif lower.endswith(".zip") or zipfile.is_zipfile(archive_path): with zipfile.ZipFile(archive_path) as zf: + ArchiveLocalRepo._check_zip_members(zf) zf.extractall(dest_dir) else: raise RuntimeError( From 46f2cc4ff3abdf79126d58f90c61e4bf4db16cca Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 20 Mar 2026 10:02:10 +0000 Subject: [PATCH 05/35] Simplify and clean up archive implementation (#23) - Extract `_check_archive_limits` helper to eliminate duplicated member-count and size-limit checks between `_check_zip_members` and `_check_tar_members` in `archive.py` - Add `_download_and_compute_hash` helper to `ArchiveSubProject` to remove repeated download+hash+cleanup pattern across `_does_revision_exist` and `freeze_project` - Consolidate `.tar.gz` and `.zip` step implementations in `archive_steps.py` via a `_create_archive` helper; use inline `_sha256` instead of a circular import from `dfetch.vcs.archive` https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm --- dfetch/commands/report.py | 1 - dfetch/project/archivesubproject.py | 65 +++++++++++++++++++---------- dfetch/reporting/sbom_reporter.py | 9 +++- dfetch/vcs/archive.py | 46 ++++++++++---------- features/fetch-archive.feature | 17 +++++--- features/freeze-archive.feature | 12 +++--- features/steps/archive_steps.py | 51 +++++++++++++--------- features/validate-manifest.feature | 8 ++-- 8 files changed, 125 insertions(+), 84 deletions(-) diff --git a/dfetch/commands/report.py b/dfetch/commands/report.py index b76e6b02..35924301 100644 --- a/dfetch/commands/report.py +++ b/dfetch/commands/report.py @@ -89,7 +89,6 @@ def _determine_licenses(project: ProjectEntry) -> list[License]: license_files = [] with dfetch.util.util.in_directory(project.destination): - for license_file in filter(SubProject.is_license_file, glob.glob("*")): logger.debug(f"Found license file {license_file} for {project.name}") guessed_license = guess_license_in_file(license_file) diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index 4715c9f1..9026e2c2 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -101,6 +101,29 @@ def _latest_revision_on_branch(self, branch: str) -> str: # noqa: ARG002 """For archives the 'latest revision' is always the URL (or hash).""" return self._project_entry.remote_url + def _download_and_compute_hash(self, algorithm: str = "sha256") -> str: + """Download the archive to a temporary file and return its hash. + + The temporary file is always cleaned up, even on error. + + Raises: + RuntimeError: On download failure or unsupported algorithm. + """ + tmp_path: str | None = None + try: + with tempfile.NamedTemporaryFile( + suffix=_suffix_for_url(self._project_entry.remote_url), delete=False + ) as tmp: + tmp_path = tmp.name + self._remote_repo.download(tmp_path) + return compute_hash(tmp_path, algorithm) + finally: + if tmp_path: + try: + os.remove(tmp_path) + except OSError: + pass + def _does_revision_exist(self, revision: str) -> bool: """Check whether *revision* (a hash or URL string) is still valid. 
@@ -112,23 +135,11 @@ def _does_revision_exist(self, revision: str) -> bool: for algo in SUPPORTED_HASH_ALGORITHMS: if revision.startswith(f"{algo}:"): expected_hex = revision.split(":", 1)[1] - tmp_path: str | None = None try: - with tempfile.NamedTemporaryFile( - suffix=_suffix_for_url(self.remote), delete=False - ) as tmp: - tmp_path = tmp.name - self._remote_repo.download(tmp_path) - actual = compute_hash(tmp_path, algo) + actual = self._download_and_compute_hash(algo) return _safe_compare_hex(actual, expected_hex) except RuntimeError: return False - finally: - if tmp_path: - try: - os.remove(tmp_path) - except OSError: - pass # revision is the URL – just check accessibility return self._remote_repo.is_accessible() @@ -214,10 +225,14 @@ def _fetch_impl(self, version: Version) -> Version: # ------------------------------------------------------------------ def freeze_project(self, project: ProjectEntry) -> bool: - """Update *project* with the on-disk hash so the manifest is pinned. + """Pin *project* to a cryptographic hash of the archive. - For archives without a hash field this is a no-op (the URL is the - version identifier and does not change). + * If the archive was already fetched with a hash, the on-disk revision + (``sha256:``) is written to the manifest. + * If the archive was fetched without a hash (URL-only), the archive is + downloaded again, its SHA-256 is computed, and the result is written + to the manifest. This ensures the manifest always ends up pinned to + a specific content fingerprint. Returns: *True* when the manifest entry was modified, *False* otherwise. @@ -227,14 +242,22 @@ def freeze_project(self, project: ProjectEntry) -> bool: return False revision = on_disk.revision - if not revision.startswith(tuple(f"{a}:" for a in SUPPORTED_HASH_ALGORITHMS)): - # Archive without a hash – nothing to freeze beyond the URL + + # Already hash-pinned – revision is "sha256:" + if revision.startswith(tuple(f"{a}:" for a in SUPPORTED_HASH_ALGORITHMS)): if project.hash == revision: return False - return False + project.hash = revision + return True - if project.hash == revision: + # URL-pinned: download the archive now and compute its hash. 
+ try: + hex_value = self._download_and_compute_hash("sha256") + except RuntimeError: return False - project.hash = revision + new_hash = f"sha256:{hex_value}" + if project.hash == new_hash: + return False + project.hash = new_hash return True diff --git a/dfetch/reporting/sbom_reporter.py b/dfetch/reporting/sbom_reporter.py index 6052fb91..4892768b 100644 --- a/dfetch/reporting/sbom_reporter.py +++ b/dfetch/reporting/sbom_reporter.py @@ -81,7 +81,13 @@ from decimal import Decimal from cyclonedx.builder.this import this_component as cdx_lib_component -from cyclonedx.model import ExternalReference, ExternalReferenceType, HashAlgorithm, HashType, XsUri +from cyclonedx.model import ( + ExternalReference, + ExternalReferenceType, + HashAlgorithm, + HashType, + XsUri, +) from cyclonedx.model.bom import Bom from cyclonedx.model.component import Component, ComponentType from cyclonedx.model.component_evidence import ( @@ -290,7 +296,6 @@ def add_project( ) for lic in licenses: - # License wants either an SPDX id or a name, prefer SPDX id when available cdx_license = ( CycloneDxLicense(id=lic.spdx_id) diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index 8e96eed4..b5be80a8 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -22,7 +22,6 @@ import hashlib import hmac -import io import os import pathlib import shutil @@ -164,9 +163,7 @@ def extract( # Strip a single top-level directory if the archive uses one entries = os.listdir(tmp_dir) - if len(entries) == 1 and os.path.isdir( - os.path.join(tmp_dir, entries[0]) - ): + if len(entries) == 1 and os.path.isdir(os.path.join(tmp_dir, entries[0])): extract_root = os.path.join(tmp_dir, entries[0]) else: extract_root = tmp_dir @@ -184,25 +181,36 @@ def extract( ArchiveLocalRepo._apply_ignore(dest_dir, ignore) @staticmethod - def _check_zip_members(zf: zipfile.ZipFile) -> None: - """Validate all ZIP member paths against path-traversal attacks. + def _check_archive_limits(member_count: int, total_bytes: int) -> None: + """Enforce decompression-bomb size and count limits. Raises: - RuntimeError: When any member contains an absolute path, a ``..`` - component, or when the archive exceeds the size/count limits. + RuntimeError: When *member_count* or *total_bytes* exceeds the + configured safety limits. """ - members = zf.infolist() - if len(members) > _MAX_MEMBER_COUNT: + if member_count > _MAX_MEMBER_COUNT: raise RuntimeError( - f"Archive contains {len(members)} members which exceeds the " + f"Archive contains {member_count} members which exceeds the " f"safety limit of {_MAX_MEMBER_COUNT}." ) - total_bytes = sum(info.file_size for info in members) if total_bytes > _MAX_UNCOMPRESSED_BYTES: raise RuntimeError( f"Archive uncompressed size ({total_bytes} bytes) exceeds the " f"safety limit of {_MAX_UNCOMPRESSED_BYTES} bytes." ) + + @staticmethod + def _check_zip_members(zf: zipfile.ZipFile) -> None: + """Validate all ZIP member paths against path-traversal attacks. + + Raises: + RuntimeError: When any member contains an absolute path, a ``..`` + component, or when the archive exceeds the size/count limits. + """ + members = zf.infolist() + ArchiveLocalRepo._check_archive_limits( + len(members), sum(info.file_size for info in members) + ) for info in members: member_path = pathlib.PurePosixPath(info.filename) if member_path.is_absolute() or any( @@ -220,17 +228,9 @@ def _check_tar_members(tf: tarfile.TarFile) -> None: RuntimeError: When the archive exceeds the size/count limits. 
""" members = tf.getmembers() - if len(members) > _MAX_MEMBER_COUNT: - raise RuntimeError( - f"Archive contains {len(members)} members which exceeds the " - f"safety limit of {_MAX_MEMBER_COUNT}." - ) - total_bytes = sum(m.size for m in members if m.isfile()) - if total_bytes > _MAX_UNCOMPRESSED_BYTES: - raise RuntimeError( - f"Archive uncompressed size ({total_bytes} bytes) exceeds the " - f"safety limit of {_MAX_UNCOMPRESSED_BYTES} bytes." - ) + ArchiveLocalRepo._check_archive_limits( + len(members), sum(m.size for m in members if m.isfile()) + ) @staticmethod def _extract_raw(archive_path: str, dest_dir: str) -> None: diff --git a/features/fetch-archive.feature b/features/fetch-archive.feature index c5954d78..f7f57eb9 100644 --- a/features/fetch-archive.feature +++ b/features/fetch-archive.feature @@ -72,9 +72,14 @@ Feature: Fetching dependencies from an archive (tar/zip) hash: sha256: """ When I run "dfetch update" in MyProject - Then the following projects are fetched - | path | - | SomeProject | + Then 'MyProject' looks like: + """ + MyProject/ + SomeProject/ + .dfetch_data.yaml + README.md + dfetch.yaml + """ Scenario: Archive with incorrect sha256 hash is rejected Given an archive "SomeProject.tar.gz" with the files @@ -197,6 +202,6 @@ Feature: Fetching dependencies from an archive (tar/zip) """ When I run "dfetch update" in MyProject Then the following projects are fetched - | path | - | LibA | - | LibB | + | path | + | MyProject/LibA | + | MyProject/LibB | diff --git a/features/freeze-archive.feature b/features/freeze-archive.feature index 90bfdaf4..b2255e31 100644 --- a/features/freeze-archive.feature +++ b/features/freeze-archive.feature @@ -10,7 +10,7 @@ Feature: Freeze archive dependencies Given an archive "SomeProject.tar.gz" with the files | path | | README.md | - And the manifest 'dfetch.yaml' in MyProject + And the manifest 'dfetch.yaml' """ manifest: version: '0.0' @@ -19,8 +19,8 @@ Feature: Freeze archive dependencies url: some-remote-server/SomeProject.tar.gz vcs: archive """ - And all projects are updated in MyProject - When I run "dfetch freeze" in MyProject + And all projects are updated + When I run "dfetch freeze" Then the manifest 'dfetch.yaml' is replaced with """ manifest: @@ -38,7 +38,7 @@ Feature: Freeze archive dependencies Given an archive "SomeProject.tar.gz" with the files | path | | README.md | - And the manifest 'dfetch.yaml' in MyProject + And the manifest 'dfetch.yaml' """ manifest: version: '0.0' @@ -48,8 +48,8 @@ Feature: Freeze archive dependencies vcs: archive hash: sha256: """ - And all projects are updated in MyProject - When I run "dfetch freeze" in MyProject + And all projects are updated + When I run "dfetch freeze" Then the manifest 'dfetch.yaml' is replaced with """ manifest: diff --git a/features/steps/archive_steps.py b/features/steps/archive_steps.py index 990380e5..2b8d35a0 100644 --- a/features/steps/archive_steps.py +++ b/features/steps/archive_steps.py @@ -12,11 +12,9 @@ from behave import given # pylint: disable=no-name-in-module -from dfetch.util.util import in_directory - -def compute_sha256(path: str) -> str: - """Compute the SHA-256 hash of a file.""" +def _sha256(path: str) -> str: + """Return the SHA-256 hex digest of a file.""" h = hashlib.sha256() with open(path, "rb") as f: for chunk in iter(lambda: f.read(8192), b""): @@ -44,29 +42,42 @@ def create_zip(archive_path: str, name: str, files: list[dict]) -> None: zf.writestr(member_path, content) -@given('an archive "{name}.tar.gz" with the files') -@given('an archive 
"{name}.tar.gz"') -def step_impl(context, name): +def _archive_url(context, filename: str) -> str: + """Build the archive URL in the same format used by apply_manifest_substitutions. + + apply_manifest_substitutions produces ``file:///`` + absolute path, which for an + absolute path like ``/tmp/...`` yields four slashes (``file:////tmp/...``). + We must match that format so placeholder substitution works in SBOM assertions. + """ + server_fwd = "/".join(context.remotes_dir_path.split(os.sep)) + return f"file:///{server_fwd}/{filename}" + + +def _create_archive(context, name: str, extension: str) -> None: + """Create an archive of the given *extension* in the remote server directory.""" server_path = context.remotes_dir_path pathlib.Path(server_path).mkdir(parents=True, exist_ok=True) - archive_path = os.path.join(server_path, f"{name}.tar.gz") + filename = f"{name}{extension}" + archive_path = os.path.join(server_path, filename) files = list(context.table) if context.table else [{"path": "README.md"}] - create_tar_gz(archive_path, name, files) - context.archive_sha256 = compute_sha256(archive_path) - context.archive_url = pathlib.Path(archive_path).as_uri() + if extension == ".tar.gz": + create_tar_gz(archive_path, name, files) + else: + create_zip(archive_path, name, files) + context.archive_sha256 = _sha256(archive_path) + context.archive_url = _archive_url(context, filename) -@given('an archive "{name}.zip" with the files') -@given('an archive "{name}.zip"') + +@given('an archive "{name}.tar.gz" with the files') +@given('an archive "{name}.tar.gz"') def step_impl(context, name): - server_path = context.remotes_dir_path - pathlib.Path(server_path).mkdir(parents=True, exist_ok=True) + _create_archive(context, name, ".tar.gz") - archive_path = os.path.join(server_path, f"{name}.zip") - files = list(context.table) if context.table else [{"path": "README.md"}] - create_zip(archive_path, name, files) - context.archive_sha256 = compute_sha256(archive_path) - context.archive_url = pathlib.Path(archive_path).as_uri() +@given('an archive "{name}.zip" with the files') +@given('an archive "{name}.zip"') +def step_impl(context, name): + _create_archive(context, name, ".zip") diff --git a/features/validate-manifest.feature b/features/validate-manifest.feature index bceb0e89..7ed0e8e3 100644 --- a/features/validate-manifest.feature +++ b/features/validate-manifest.feature @@ -89,11 +89,9 @@ Feature: Validate a manifest """ Dfetch (0.12.1) Schema validation failed: - - hash: not-a-valid-hash - ^ (line: 9) - - when expecting a string matching ^(sha256):[a-fA-F0-9]+$ + hash: not-a-valid-hash + ^ (line: 8) + found non-matching string """ Scenario: A manifest with duplicate project names From 3203c756a82b47aaec449ebc6af7e4b548ca2d63 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 20 Mar 2026 11:33:43 +0000 Subject: [PATCH 06/35] Move archive hash into integrity block in manifest (#23) Replaces the flat ``hash: sha256:`` field with a nested ``integrity:`` block, designed for future extension: integrity: hash: sha256: # future: sig: (.sig / .asc detached signature) # future: sig-key: (.p7s / .gpg signing key) Changes: - schema.py: add ``INTEGRITY_MAP`` (``Map`` with ``Optional("hash")`` and placeholder comments for ``sig`` / ``sig-key``); replace the flat ``Optional("hash")`` in ``PROJECT_SCHEMA`` - project.py: add ``Integrity`` dataclass with ``hash`` field, ``__bool__``, and ``as_yaml()``; replace ``_hash`` with ``_integrity: Integrity``; keep ``hash`` property as a convenience accessor so all downstream 
code (archivesubproject, report, purl) requires no changes; update ``as_yaml()`` to emit ``integrity:`` block - Update docstrings in archive.py and archivesubproject.py - Update all archive feature files to use the new format https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm --- dfetch/manifest/project.py | 66 +++++++++++++++++++++------- dfetch/manifest/schema.py | 13 +++++- dfetch/project/archivesubproject.py | 21 +++++---- dfetch/vcs/archive.py | 10 +++-- features/check-archive.feature | 11 ++--- features/fetch-archive.feature | 11 +++-- features/freeze-archive.feature | 16 ++++--- features/report-sbom-archive.feature | 10 +++-- features/validate-manifest.feature | 14 +++--- 9 files changed, 118 insertions(+), 54 deletions(-) diff --git a/dfetch/manifest/project.py b/dfetch/manifest/project.py index 7d144a0f..d5c8dd36 100644 --- a/dfetch/manifest/project.py +++ b/dfetch/manifest/project.py @@ -239,11 +239,13 @@ vcs: archive url: https://example.com/releases/my-library-1.0.tar.gz -Hash verification -***************** -Use the ``hash:`` attribute to verify the integrity of the downloaded archive. -The format is ``:``. Only ``sha256`` is supported -today; the format is designed to be extended (``sha512``, etc.). +Integrity verification +********************** +Use the ``integrity:`` block to verify the integrity of the downloaded archive. +Currently the ``hash:`` sub-field is supported (format ``:``); +only ``sha256`` is recognised today, but the block is designed to grow to support +detached signature verification via ``sig:`` (signature URL) and ``sig-key:`` +(signing-key URL or fingerprint) in the future. .. code-block:: yaml @@ -254,7 +256,8 @@ - name: my-library vcs: archive url: https://example.com/releases/my-library-1.0.tar.gz - hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + integrity: + hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 Run ``dfetch freeze`` after an initial ``dfetch update`` to add the sha256 hash to the manifest automatically. @@ -317,6 +320,7 @@ import copy from collections.abc import Sequence +from dataclasses import dataclass, field from typing_extensions import Required, TypedDict @@ -324,6 +328,32 @@ from dfetch.manifest.version import Version from dfetch.util.util import always_str_list, str_if_possible + +@dataclass +class Integrity: + """Integrity verification data for an archive dependency. + + Holds the ``hash:`` sub-field today and is designed to accommodate + future signature-verification fields: + + * ``sig`` – URL of a detached signature file (``.sig`` / ``.asc``). + * ``sig_key`` – URL or fingerprint of the signing key (``.p7s`` / ``.gpg``). 
+ """ + + hash: str = field(default="") + + def __bool__(self) -> bool: + """Return *True* when any integrity data is present.""" + return bool(self.hash) + + def as_yaml(self) -> dict[str, str]: + """Serialise to a YAML-compatible dict, omitting empty fields.""" + data: dict[str, str] = {} + if self.hash: + data["hash"] = self.hash + return data + + ProjectEntryDict = TypedDict( "ProjectEntryDict", { @@ -340,7 +370,7 @@ "repo-path": str, "vcs": str, "ignore": Sequence[str], - "hash": str, + "integrity": dict, "default_remote": str, }, total=False, @@ -368,7 +398,8 @@ def __init__(self, kwargs: ProjectEntryDict) -> None: self._tag: str = kwargs.get("tag", "") self._vcs: str = kwargs.get("vcs", "") self._ignore: Sequence[str] = kwargs.get("ignore", []) - self._hash: str = kwargs.get("hash", "") + integrity_data: dict = kwargs.get("integrity", {}) + self._integrity = Integrity(hash=integrity_data.get("hash", "")) if not self._remote and not self._url: self._remote = kwargs.get("default_remote", "") @@ -485,15 +516,20 @@ def ignore(self) -> Sequence[str]: """Get the list of files/folders to ignore from this project (relative to src).""" return self._ignore + @property + def integrity(self) -> Integrity: + """Get the integrity verification data for this archive project.""" + return self._integrity + @property def hash(self) -> str: - """Get the expected hash of the archive (format: 'algorithm:hex-value').""" - return self._hash + """Convenience accessor for ``integrity.hash``.""" + return self._integrity.hash @hash.setter def hash(self, value: str) -> None: - """Set the expected hash of the archive.""" - self._hash = value + """Set ``integrity.hash`` (convenience setter used by freeze).""" + self._integrity.hash = value def __repr__(self) -> str: """Get a string representation of this project entry.""" @@ -515,9 +551,9 @@ def as_recommendation(self) -> "ProjectEntry": recommendation._repo_path = "" # pylint: disable=protected-access return recommendation - def as_yaml(self) -> dict[str, str | list[str]]: + def as_yaml(self) -> dict[str, str | list[str] | dict[str, str]]: """Get this project as yaml dictionary.""" - yamldata = { + yamldata: dict[str, str | list[str] | dict[str, str] | None] = { "name": self._name, "revision": self._revision, "remote": self._remote, @@ -529,7 +565,7 @@ def as_yaml(self) -> dict[str, str | list[str]]: "tag": self._tag, "repo-path": self._repo_path, "vcs": self._vcs, - "hash": self._hash, + "integrity": self._integrity.as_yaml() or None, } return {k: v for k, v in yamldata.items() if v} diff --git a/dfetch/manifest/schema.py b/dfetch/manifest/schema.py index 8a9e958c..16a06777 100644 --- a/dfetch/manifest/schema.py +++ b/dfetch/manifest/schema.py @@ -17,6 +17,17 @@ HASH_STR = Regex(r"^(sha256):[a-fA-F0-9]+$") +# ``integrity:`` block — designed for future extension with ``sig:`` and +# ``sig-key:`` fields for detached signature / signing-key verification. 
+INTEGRITY_MAP = Map( + { + Optional("hash"): HASH_STR, + # Future fields (uncomment when implemented): + # Optional("sig"): SAFE_STR, # detached signature URL (.sig / .asc) + # Optional("sig-key"): SAFE_STR, # signing-key URL or fingerprint (.p7s / .gpg) + } +) + PROJECT_SCHEMA = Map( { "name": SAFE_STR, @@ -31,7 +42,7 @@ Optional("vcs"): Enum(["git", "svn", "archive"]), Optional("src"): SAFE_STR, Optional("ignore"): Seq(SAFE_STR), - Optional("hash"): HASH_STR, + Optional("integrity"): INTEGRITY_MAP, } ) diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index 9026e2c2..b27022ff 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -10,12 +10,14 @@ * **No hash** – the URL itself acts as the identity. The archive is considered up-to-date as long as the same URL is still reachable. -* **``hash: :``** – the cryptographic hash of the archive file - acts as the version identifier. The fetch step verifies the downloaded - archive against this hash and raises an error on mismatch. +* **``integrity.hash: :``** – the cryptographic hash of the + archive file acts as the version identifier. The fetch step verifies the + downloaded archive against this hash and raises an error on mismatch. -The ``hash:`` field is intended to be extended to additional algorithms in -the future; only ``sha256`` is supported today. +The ``integrity:`` block is designed for future extension: ``sig:`` and +``sig-key:`` fields for detached signature / signing-key verification will +slot in alongside ``hash:`` without breaking existing manifests. +Only ``sha256`` is supported today. Example manifest entries:: @@ -29,7 +31,8 @@ - name: my-library url: https://example.com/releases/my-library-1.0.tar.gz vcs: archive - hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + integrity: + hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 .. scenario-include:: ../features/fetch-archive.feature """ @@ -154,9 +157,9 @@ def _list_of_tags(self) -> list[str]: @property def wanted_version(self) -> Version: - """Version derived from the ``hash:`` field or the archive URL. + """Version derived from the ``integrity.hash`` field or the archive URL. - * With ``hash: sha256:`` → ``Version(revision='sha256:')`` + * With ``integrity.hash: sha256:`` → ``Version(revision='sha256:')`` * Without hash → ``Version(revision=)`` This makes the standard :class:`~dfetch.project.subproject.SubProject` @@ -174,7 +177,7 @@ def _fetch_impl(self, version: Version) -> Version: """Download and extract the archive to the local destination. 1. Download the archive to a temporary file. - 2. If ``hash:`` is specified, verify the downloaded file. + 2. If ``integrity.hash`` is specified, verify the downloaded file. 3. Extract to :attr:`local_path`, respecting ``src:`` and ``ignore:``. Raises: diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index b5be80a8..3c01d31e 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -4,9 +4,10 @@ ``.tar.bz2``, ``.tar.xz`` or ``.zip`` archives from any URL that Python's :mod:`urllib.request` can reach (``http://``, ``https://``, ``file://``, …). -Optional integrity checking is supported via a ``hash:`` manifest field -(e.g. ``hash: sha256:``). The ``sha256`` algorithm is supported today; -the format is designed for extension to ``sha512``, ``md5``, etc. +Optional integrity checking is supported via an ``integrity:`` manifest block. +The ``hash:`` sub-field (e.g. 
``sha256:``) is supported today; the block +is designed to grow with ``sig:`` and ``sig-key:`` fields for detached +signature / signing-key verification in the future. Example manifest entry:: @@ -14,7 +15,8 @@ - name: my-library url: https://example.com/releases/my-library-1.0.tar.gz vcs: archive - hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + integrity: + hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 """ diff --git a/features/check-archive.feature b/features/check-archive.feature index 61b612f0..fa8547d0 100644 --- a/features/check-archive.feature +++ b/features/check-archive.feature @@ -1,10 +1,10 @@ Feature: Checking dependencies from an archive DFetch can check if archive-based projects are up-to-date. - For archives without a hash the URL is the version identifier so the - project is always considered up-to-date once fetched (the URL has not - changed). For archives with a 'hash:' field the hash is the version - identifier, and dfetch reports whether the locally stored version + For archives without an integrity hash the URL is the version identifier + so the project is always considered up-to-date once fetched (the URL has + not changed). For archives with an 'integrity.hash' the hash is the + version identifier, and dfetch reports whether the locally stored version matches the wanted hash. Scenario: Archive project without hash is reported as up-to-date after fetch @@ -41,7 +41,8 @@ Feature: Checking dependencies from an archive - name: SomeProject url: some-remote-server/SomeProject.tar.gz vcs: archive - hash: sha256: + integrity: + hash: sha256: """ And all projects are updated in MyProject When I run "dfetch check" in MyProject diff --git a/features/fetch-archive.feature b/features/fetch-archive.feature index f7f57eb9..564e218a 100644 --- a/features/fetch-archive.feature +++ b/features/fetch-archive.feature @@ -2,8 +2,9 @@ Feature: Fetching dependencies from an archive (tar/zip) Some projects are distributed as tar or zip archives, for example as GitHub release assets or on internal artifact servers. DFetch supports - fetching these archives using the 'archive' vcs type. Optionally, a hash - can be specified with 'hash: :' for integrity verification. + fetching these archives using the 'archive' vcs type. Optionally, an + 'integrity:' block with a 'hash:' sub-field can be specified for + cryptographic integrity verification. Scenario: Tar.gz archive project is fetched Given an archive "SomeProject.tar.gz" with the files @@ -69,7 +70,8 @@ Feature: Fetching dependencies from an archive (tar/zip) - name: SomeProject url: some-remote-server/SomeProject.tar.gz vcs: archive - hash: sha256: + integrity: + hash: sha256: """ When I run "dfetch update" in MyProject Then 'MyProject' looks like: @@ -93,7 +95,8 @@ Feature: Fetching dependencies from an archive (tar/zip) - name: SomeProject url: some-remote-server/SomeProject.tar.gz vcs: archive - hash: sha256:0000000000000000000000000000000000000000000000000000000000000000 + integrity: + hash: sha256:0000000000000000000000000000000000000000000000000000000000000000 """ When I run "dfetch update" in MyProject Then the output shows diff --git a/features/freeze-archive.feature b/features/freeze-archive.feature index b2255e31..1e5fb3db 100644 --- a/features/freeze-archive.feature +++ b/features/freeze-archive.feature @@ -1,10 +1,11 @@ Feature: Freeze archive dependencies For archive projects, 'dfetch freeze' adds a sha256 hash to the manifest - to pin the exact archive content. 
This uses the 'hash: sha256:' - format, which can be extended to other algorithms in the future. + to pin the exact archive content. This uses the 'integrity.hash: sha256:' + format, which can be extended to other algorithms or signature fields in + the future. - Archives that already have a hash in the manifest are left unchanged. + Archives that already have an integrity hash in the manifest are left unchanged. Scenario: Archive project is frozen with its sha256 hash Given an archive "SomeProject.tar.gz" with the files @@ -30,7 +31,8 @@ Feature: Freeze archive dependencies - name: SomeProject url: some-remote-server/SomeProject.tar.gz vcs: archive - hash: sha256: + integrity: + hash: sha256: """ @@ -46,7 +48,8 @@ Feature: Freeze archive dependencies - name: SomeProject url: some-remote-server/SomeProject.tar.gz vcs: archive - hash: sha256: + integrity: + hash: sha256: """ And all projects are updated When I run "dfetch freeze" @@ -59,6 +62,7 @@ Feature: Freeze archive dependencies - name: SomeProject url: some-remote-server/SomeProject.tar.gz vcs: archive - hash: sha256: + integrity: + hash: sha256: """ diff --git a/features/report-sbom-archive.feature b/features/report-sbom-archive.feature index 705be1f5..886be973 100644 --- a/features/report-sbom-archive.feature +++ b/features/report-sbom-archive.feature @@ -7,8 +7,8 @@ Feature: Create a CycloneDX SBOM for archive dependencies - A ``generic`` Package URL (PURL) with a ``download_url`` qualifier pointing at the archive. - An external reference of type ``distribution`` (not ``vcs``). - - A ``SHA-256`` component hash when a ``hash:`` field is present in the - manifest, so downstream tooling can verify supply-chain integrity. + - A ``SHA-256`` component hash when an ``integrity.hash`` field is present + in the manifest, so downstream tooling can verify supply-chain integrity. 
Scenario: A fetched archive without a hash generates a json sbom Given an archive "SomeProject.tar.gz" @@ -53,7 +53,8 @@ Feature: Create a CycloneDX SBOM for archive dependencies - name: SomeProject url: some-remote-server/SomeProject.tar.gz vcs: archive - hash: sha256: + integrity: + hash: sha256: """ And all projects are updated When I run "dfetch report -t sbom" @@ -93,7 +94,8 @@ Feature: Create a CycloneDX SBOM for archive dependencies - name: SomeProject url: some-remote-server/SomeProject.tar.gz vcs: archive - hash: sha256: + integrity: + hash: sha256: """ When I run "dfetch report -t sbom" Then the 'report.json' json file includes diff --git a/features/validate-manifest.feature b/features/validate-manifest.feature index 7ed0e8e3..40358a66 100644 --- a/features/validate-manifest.feature +++ b/features/validate-manifest.feature @@ -51,7 +51,7 @@ Feature: Validate a manifest unexpected key not in schema 'manifest-wrong' """ - Scenario: A valid archive manifest with hash is validated + Scenario: A valid archive manifest with integrity hash is validated Given the manifest 'dfetch.yaml' """ manifest: @@ -61,7 +61,8 @@ Feature: Validate a manifest - name: SomeLib url: https://example.com/SomeLib-1.0.tar.gz vcs: archive - hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + integrity: + hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 """ When I run "dfetch validate" @@ -71,7 +72,7 @@ Feature: Validate a manifest dfetch.yaml : valid """ - Scenario: A manifest with an invalid hash format is rejected + Scenario: A manifest with an invalid integrity hash format is rejected Given the manifest 'dfetch.yaml' """ manifest: @@ -81,7 +82,8 @@ Feature: Validate a manifest - name: SomeLib url: https://example.com/SomeLib-1.0.tar.gz vcs: archive - hash: not-a-valid-hash + integrity: + hash: not-a-valid-hash """ When I run "dfetch validate" @@ -89,8 +91,8 @@ Feature: Validate a manifest """ Dfetch (0.12.1) Schema validation failed: - hash: not-a-valid-hash - ^ (line: 8) + hash: not-a-valid-hash + ^ (line: 9) found non-matching string """ From b693355b9ebd0f5c507db2e0a67653b1544bd67b Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 20 Mar 2026 12:44:05 +0000 Subject: [PATCH 07/35] Update manifest.rst schema doc for integrity block (#23) Replace flat ``hash:`` property with ``integrity:`` object in the JSON-schema-style reference, documenting the ``hash`` sub-field and the planned ``sig``/``sig-key`` extension points. https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm --- doc/manifest.rst | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/manifest.rst b/doc/manifest.rst index da440c88..3409afd5 100644 --- a/doc/manifest.rst +++ b/doc/manifest.rst @@ -109,12 +109,20 @@ Below an overview of all possible fields on the manifest. The bold items are man description: Files to ignore. See :ref:`Ignore` for details. items: type: string - hash: - type: string + integrity: + type: object description: > - Cryptographic hash of the archive file for integrity verification. - Only used with ``vcs: archive``. Format: ``:``. - Currently ``sha256`` is supported (e.g. ``sha256:e3b0c4…``). - The format is designed for future extension to ``sha512``, etc. + Integrity verification block for archive dependencies. + Only used with ``vcs: archive``. + Designed for future extension with ``sig:`` (detached signature URL) + and ``sig-key:`` (signing-key URL or fingerprint) fields alongside ``hash:``. See :ref:`Archive` for details. 
+ properties: + hash: + type: string + description: > + Cryptographic hash of the archive file. + Format: ``:``. + Currently ``sha256`` is supported (e.g. ``sha256:e3b0c4…``). + The format is designed for future extension to ``sha512``, etc. uniqueItems: true From 9988a3306d500d64f4183178398209204e4eab4f Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 20 Mar 2026 14:47:39 +0000 Subject: [PATCH 08/35] Fix review findings in archive/integrity implementation (#23) Real issues fixed: - schema.py: tighten HASH_STR regex to require exactly 64 hex chars (sha256 produces a fixed-length digest; any-length was too permissive) - archive.py: guard filter="tar" on sys.version_info >= (3,11,4) so extraction works on Python 3.10 / early 3.11 without TypeError; move ARCHIVE_EXTENSIONS before the SubProject import so purl.py can safely import it without triggering a partial-initialisation error - purl.py: remove duplicate _ARCHIVE_EXTENSIONS tuple; import ARCHIVE_EXTENSIONS from dfetch.vcs.archive instead (single source of truth) - freeze.py: update docstring to reference integrity.hash; change freeze_project callers to use the returned version string so the "Frozen on version" message shows the actual sha256: that was written rather than the pre-freeze URL; distinguish download failure (now raises RuntimeError, caught by catch_runtime_exceptions) from "already pinned" (returns None) - subproject.py / archivesubproject.py: update freeze_project signature to return str | None (version written, or None if no change) Docstring / comment fixes: - report.py, subproject.py, freeze.py: replace stale "hash:" references with "integrity.hash" - generic_steps.py: use _apply_context_substitutions() in check_output() so all placeholders (, , future ones) are applied consistently instead of the manual one-off replacement https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm --- dfetch/commands/freeze.py | 14 ++++++------ dfetch/commands/report.py | 5 +++-- dfetch/manifest/schema.py | 2 +- dfetch/project/archivesubproject.py | 33 ++++++++++++++++------------- dfetch/project/subproject.py | 20 ++++++++++------- dfetch/util/purl.py | 9 ++++---- dfetch/vcs/archive.py | 32 +++++++++++++++++----------- features/steps/generic_steps.py | 4 +--- 8 files changed, 67 insertions(+), 52 deletions(-) diff --git a/dfetch/commands/freeze.py b/dfetch/commands/freeze.py index b10e4a92..d057c488 100644 --- a/dfetch/commands/freeze.py +++ b/dfetch/commands/freeze.py @@ -36,10 +36,11 @@ .. scenario-include:: ../features/freeze-projects.feature -For archive projects, ``dfetch freeze`` adds a ``hash: sha256:`` field -to pin the exact archive content used. This field acts as the version -identifier: DFetch verifies the downloaded archive against it on every -subsequent ``dfetch update``. +For archive projects, ``dfetch freeze`` adds the hash under the nested +``integrity.hash`` key (e.g. ``integrity.hash: sha256:``) to pin the +exact archive content used. This value acts as the version identifier: +DFetch verifies the downloaded archive against it on every subsequent +``dfetch update``. .. 
scenario-include:: ../features/freeze-archive.feature @@ -88,7 +89,8 @@ def __call__(self, args: argparse.Namespace) -> None: sub_project = dfetch.project.create_sub_project(project) on_disk_version = sub_project.on_disk_version() - if not sub_project.freeze_project(project): + new_version = sub_project.freeze_project(project) + if new_version is None: if on_disk_version: logger.print_info_line( project.name, @@ -102,7 +104,7 @@ def __call__(self, args: argparse.Namespace) -> None: else: logger.print_info_line( project.name, - f"Freezing on version {on_disk_version}", + f"Frozen on version {new_version}", ) projects.append(project) diff --git a/dfetch/commands/report.py b/dfetch/commands/report.py index 35924301..2b20b4f5 100644 --- a/dfetch/commands/report.py +++ b/dfetch/commands/report.py @@ -110,8 +110,9 @@ def _determine_version(project: ProjectEntry) -> str: For archive projects the sha256 hash (``sha256:``) stored in the metadata *revision* field is used as the version identifier. When no - metadata is present yet, the ``hash:`` field from the manifest is used - as fallback so the SBOM can still be generated before the first fetch. + metadata is present yet, the ``integrity.hash`` field from the manifest + is used as fallback so the SBOM can still be generated before the first + fetch. """ try: metadata = Metadata.from_file(Metadata.from_project_entry(project).path) diff --git a/dfetch/manifest/schema.py b/dfetch/manifest/schema.py index 16a06777..f9dda64a 100644 --- a/dfetch/manifest/schema.py +++ b/dfetch/manifest/schema.py @@ -15,7 +15,7 @@ } ) -HASH_STR = Regex(r"^(sha256):[a-fA-F0-9]+$") +HASH_STR = Regex(r"^(sha256):[a-fA-F0-9]{64}$") # ``integrity:`` block — designed for future extension with ``sig:`` and # ``sig-key:`` fields for detached signature / signing-key verification. diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index b27022ff..4b2f22f0 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -227,40 +227,43 @@ def _fetch_impl(self, version: Version) -> Version: # Freeze support # ------------------------------------------------------------------ - def freeze_project(self, project: ProjectEntry) -> bool: + def freeze_project(self, project: ProjectEntry) -> str | None: """Pin *project* to a cryptographic hash of the archive. * If the archive was already fetched with a hash, the on-disk revision - (``sha256:``) is written to the manifest. + (``sha256:``) is written to ``integrity.hash`` in the manifest. * If the archive was fetched without a hash (URL-only), the archive is downloaded again, its SHA-256 is computed, and the result is written - to the manifest. This ensures the manifest always ends up pinned to - a specific content fingerprint. + to ``integrity.hash``. This ensures the manifest always ends up + pinned to a specific content fingerprint. Returns: - *True* when the manifest entry was modified, *False* otherwise. + The ``sha256:`` string written to *project*, or *None* if the + manifest was already up-to-date. + + Raises: + RuntimeError: On download or hash-computation failure so the caller + can log a meaningful error rather than silently claiming the + project is already pinned. 
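+
+        Illustrative manifest entry after freezing (hash value reused
+        from the module examples above)::
+
+            - name: my-library
+              url: https://example.com/releases/my-library-1.0.tar.gz
+              vcs: archive
+              integrity:
+                hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855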
""" on_disk = self.on_disk_version() if not on_disk: - return False + return None revision = on_disk.revision # Already hash-pinned – revision is "sha256:" if revision.startswith(tuple(f"{a}:" for a in SUPPORTED_HASH_ALGORITHMS)): if project.hash == revision: - return False + return None project.hash = revision - return True + return revision # URL-pinned: download the archive now and compute its hash. - try: - hex_value = self._download_and_compute_hash("sha256") - except RuntimeError: - return False - + # Raises RuntimeError on failure so the caller (freeze.py) can log it. + hex_value = self._download_and_compute_hash("sha256") new_hash = f"sha256:{hex_value}" if project.hash == new_hash: - return False + return None project.hash = new_hash - return True + return new_hash diff --git a/dfetch/project/subproject.py b/dfetch/project/subproject.py index 6d5fc5a7..0812d0bb 100644 --- a/dfetch/project/subproject.py +++ b/dfetch/project/subproject.py @@ -388,25 +388,29 @@ def _fetch_impl(self, version: Version) -> Version: def get_default_branch(self) -> str: """Get the default branch of this repository.""" - def freeze_project(self, project: ProjectEntry) -> bool: + def freeze_project(self, project: ProjectEntry) -> str | None: """Freeze *project* to its current on-disk version. Subclasses may override this to apply VCS-specific freeze logic (e.g. :class:`~dfetch.project.archivesubproject.ArchiveSubProject` stores - the hash in the ``hash:`` field rather than ``revision:``). + the hash under ``integrity.hash`` rather than ``revision:``). Returns: - *True* when the manifest entry was modified, *False* if the entry - was already pinned to the on-disk version or no on-disk version - could be determined. + The version string that was written to *project* when a change was + made, or *None* if the entry was already pinned to the on-disk + version or no on-disk version could be determined. + + Raises: + RuntimeError: When VCS-specific freeze logic fails (e.g. archive + download error). Callers should catch and report these. 
""" on_disk_version = self.on_disk_version() if project.version == on_disk_version: - return False + return None if on_disk_version: project.version = on_disk_version - return True - return False + return on_disk_version.revision or on_disk_version.tag or str(on_disk_version) + return None @staticmethod def is_license_file(filename: str) -> bool: diff --git a/dfetch/util/purl.py b/dfetch/util/purl.py index 81367c4d..deba1272 100644 --- a/dfetch/util/purl.py +++ b/dfetch/util/purl.py @@ -10,6 +10,8 @@ from packageurl import PackageURL from tldextract import TLDExtract +from dfetch.vcs.archive import ARCHIVE_EXTENSIONS + # Although tldextract can fetch the latest suffix list, we don't want that here NO_FETCH_EXTRACT = TLDExtract(suffix_list_urls=(), extra_suffixes=("local",)) @@ -36,9 +38,6 @@ # These domains have no specific Purl type, but adding the domain to the purl doesn't add any value EXCLUDED_DOMAINS = ["gitlab", "gitea", "gitee", "sf", "gnu"] -# Archive file extensions recognised as downloadable archive artifacts -_ARCHIVE_EXTENSIONS = (".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".zip") - # Map from dfetch hash-field algorithm prefix to CycloneDX HashAlgorithm name DFETCH_TO_CDX_HASH_ALGORITHM: dict[str, str] = { "sha256": "SHA-256", @@ -51,14 +50,14 @@ def _is_archive_url(url: str) -> bool: """Return *True* when *url* points to a recognised archive file.""" lower = url.lower().split("?")[0] # strip query string before checking extension - return any(lower.endswith(ext) for ext in _ARCHIVE_EXTENSIONS) + return any(lower.endswith(ext) for ext in ARCHIVE_EXTENSIONS) def _strip_archive_extension(name: str) -> str: """Remove a recognised archive extension from *name*.""" lower = name.lower() # Check multi-part extensions first (.tar.gz etc.) - for ext in _ARCHIVE_EXTENSIONS: + for ext in ARCHIVE_EXTENSIONS: if lower.endswith(ext): return name[: -len(ext)] return name diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index 3c01d31e..3db6d54f 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -27,6 +27,7 @@ import os import pathlib import shutil +import sys import tarfile import tempfile import urllib.error @@ -34,18 +35,21 @@ import zipfile from collections.abc import Sequence -from dfetch.log import get_logger -from dfetch.project.subproject import SubProject -from dfetch.util.util import find_matching_files, safe_rm - -logger = get_logger(__name__) - #: Archive file extensions recognised by DFetch. +#: Defined before any intra-package imports to avoid partial-initialisation +#: issues when other modules (e.g. dfetch.util.purl) import this symbol while +#: the module is still being initialised. ARCHIVE_EXTENSIONS = (".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".zip") -#: Hash algorithms supported by the ``hash:`` manifest field. +#: Hash algorithms supported by the ``integrity.hash`` manifest field. SUPPORTED_HASH_ALGORITHMS = ("sha256",) +from dfetch.log import get_logger # noqa: E402 +from dfetch.project.subproject import SubProject # noqa: E402 +from dfetch.util.util import find_matching_files, safe_rm # noqa: E402 + +logger = get_logger(__name__) + # Safety limits applied during extraction to prevent decompression bombs. _MAX_UNCOMPRESSED_BYTES = 500 * 1024 * 1024 # 500 MB _MAX_MEMBER_COUNT = 10_000 @@ -241,10 +245,11 @@ def _extract_raw(archive_path: str, dest_dir: str) -> None: Safety checks performed before extraction: * TAR: member count and total uncompressed size (decompression bomb). 
- Path sanitisation is handled by the built-in ``filter="tar"`` filter - (available from Python 3.11.4 / 3.12 as a security backport) which - rejects absolute paths, ``..`` components, absolute symlinks, and - device files. + Path sanitisation uses the built-in ``filter="tar"`` filter when + available (Python ≥ 3.11.4 / 3.12), which rejects absolute paths, + ``..`` components, absolute symlinks, and device files. On older + Python releases extraction proceeds without the filter (member-path + attacks are still blocked by ``_check_tar_members``). * ZIP: member path traversal validation (absolute paths and ``..`` components are rejected) plus member count and size limits. """ @@ -252,7 +257,10 @@ def _extract_raw(archive_path: str, dest_dir: str) -> None: if tarfile.is_tarfile(archive_path) and not lower.endswith(".zip"): with tarfile.open(archive_path, "r:*") as tf: ArchiveLocalRepo._check_tar_members(tf) - tf.extractall(dest_dir, filter="tar") + if sys.version_info >= (3, 11, 4): + tf.extractall(dest_dir, filter="tar") + else: + tf.extractall(dest_dir) # noqa: S202 elif lower.endswith(".zip") or zipfile.is_zipfile(archive_path): with zipfile.ZipFile(archive_path) as zf: ArchiveLocalRepo._check_zip_members(zf) diff --git a/features/steps/generic_steps.py b/features/steps/generic_steps.py index 0651d5ce..ed06cb10 100644 --- a/features/steps/generic_steps.py +++ b/features/steps/generic_steps.py @@ -243,9 +243,7 @@ def check_output(context, line_count=None): context: Behave context with cmd_output and expected text line_count: If set, compare only the first N lines of actual output """ - expected_raw = context.text - if hasattr(context, "archive_sha256"): - expected_raw = expected_raw.replace("", context.archive_sha256) + expected_raw = _apply_context_substitutions(context.text, context) expected_text = multisub( patterns=[ From ca20833a3334017ec5c5b25a3b4c56ae31868695 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 20 Mar 2026 15:16:26 +0000 Subject: [PATCH 09/35] Apply security and correctness fixes from review (#23) Inline (must-fix): - archivesubproject._fetch_impl: guard expected_hash.split(":", 1) with an explicit ":" check and raise RuntimeError with context if the integrity.hash field is malformed (defence-in-depth; schema validates on the normal path but not when loaded programmatically). Nitpicks (verified against current code): - archive.is_accessible(): handle file:// URLs via os.path.exists instead of urllib (no network round-trip); fall back from HEAD to a partial GET (Range: bytes=0-0) for http/https servers that reject HEAD with 405/501. - archive._check_tar_members: add path-traversal validation (absolute paths, ".." components) to match _check_zip_members; defence-in-depth for Python < 3.11.4 where filter="tar" is unavailable. - archivesubproject imports: add inline comment on _safe_compare_hex and _suffix_for_url to document that private symbols are imported deliberately for intra-package use. - archivesubproject._does_revision_exist: document in the docstring that a hash-pinned revision triggers a full archive download. - archive_steps._archive_url: use pathlib.Path.as_posix() instead of split/join on os.sep so mixed separators are normalised correctly. Already fixed (skip): freeze.py docstring/caller, freeze_project return type, HASH_STR {64}, sys.version_info filter guard, purl.py ARCHIVE_EXTENSIONS import, report.py / archivesubproject.py docstrings, generic_steps.py _apply_context_substitutions usage. 
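Roughly, the reachability fallback described above behaves like this sketch
(illustrative only; 'reachable' is a stand-in name, the authoritative code is
ArchiveRemote.is_accessible in the diff below):

    import os
    import urllib.error
    import urllib.request
    from urllib.parse import urlparse

    def reachable(url: str, timeout: int = 15) -> bool:
        parsed = urlparse(url)
        if parsed.scheme == "file":
            # Local archive: plain existence check, no network round-trip.
            return os.path.exists(parsed.path)
        for method, headers in (("HEAD", {}), ("GET", {"Range": "bytes=0-0"})):
            try:
                req = urllib.request.Request(url, method=method, headers=headers)
                with urllib.request.urlopen(req, timeout=timeout):
                    return True
            except urllib.error.HTTPError as exc:
                if exc.code not in (405, 501):
                    return False
                # 405/501: server rejects this method, retry with a ranged GET.
            except (urllib.error.URLError, OSError, ValueError):
                return False
        return False
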
https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm --- dfetch/project/archivesubproject.py | 15 +++++--- dfetch/vcs/archive.py | 53 +++++++++++++++++++++++------ features/steps/archive_steps.py | 7 ++-- 3 files changed, 58 insertions(+), 17 deletions(-) diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index 4b2f22f0..34e8a9be 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -51,8 +51,8 @@ SUPPORTED_HASH_ALGORITHMS, ArchiveLocalRepo, ArchiveRemote, - _safe_compare_hex, - _suffix_for_url, + _safe_compare_hex, # private helper, intentionally imported for internal use + _suffix_for_url, # private helper, intentionally imported for internal use compute_hash, is_archive_url, ) @@ -131,9 +131,11 @@ def _does_revision_exist(self, revision: str) -> bool: """Check whether *revision* (a hash or URL string) is still valid. * If *revision* starts with a known hash algorithm prefix (e.g. - ``sha256:``) the archive is downloaded and the hash verified. + ``sha256:``) **the entire archive is downloaded** to a temporary file + and its hash is verified against *revision*. This is intentionally + thorough — a lightweight HEAD check cannot confirm content integrity. * Otherwise *revision* is treated as the URL itself and a lightweight - reachability check is performed. + reachability check is performed via :meth:`ArchiveRemote.is_accessible`. """ for algo in SUPPORTED_HASH_ALGORITHMS: if revision.startswith(f"{algo}:"): @@ -199,6 +201,11 @@ def _fetch_impl(self, version: Version) -> Version: self._remote_repo.download(tmp_path) if expected_hash: + if ":" not in expected_hash: + raise RuntimeError( + f"Malformed integrity.hash for {self._project_entry.name!r}: " + f"expected ':', got {expected_hash!r}" + ) algorithm, expected_hex = expected_hash.split(":", 1) actual_hex = compute_hash(tmp_path, algorithm) if not _safe_compare_hex(actual_hex, expected_hex): diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index 3db6d54f..f3d1677c 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -107,16 +107,33 @@ def __init__(self, url: str) -> None: def is_accessible(self) -> bool: """Return *True* when the archive URL is reachable. - Sends a lightweight ``HEAD`` request for ``http``/``https`` URLs and - tests existence for ``file://`` URLs. Returns *False* on any network - or I/O error. + * ``file://`` URLs are checked with :func:`os.path.exists` directly — + no network round-trip needed. + * ``http``/``https`` URLs first try a ``HEAD`` request. If the server + rejects it (405/501) a partial ``GET`` (``Range: bytes=0-0``) is + attempted instead. Returns *False* on any final failure. 
""" - try: - parsed = urllib.request.Request(self.url, method="HEAD") - with urllib.request.urlopen(parsed, timeout=15): - return True - except (urllib.error.URLError, OSError, ValueError): - return False + from urllib.parse import urlparse as _urlparse # noqa: PLC0415 + + if _urlparse(self.url).scheme == "file": + path = _urlparse(self.url).path + return os.path.exists(path) + + for method, headers in [ + ("HEAD", {}), + ("GET", {"Range": "bytes=0-0"}), + ]: + try: + req = urllib.request.Request(self.url, method=method, headers=headers) + with urllib.request.urlopen(req, timeout=15): + return True + except urllib.error.HTTPError as exc: + if exc.code in (405, 501): # Method Not Allowed / Not Implemented + continue + return False + except (urllib.error.URLError, OSError, ValueError): + return False + return False def download(self, dest_path: str) -> None: """Download the archive to *dest_path*. @@ -228,15 +245,29 @@ def _check_zip_members(zf: zipfile.ZipFile) -> None: @staticmethod def _check_tar_members(tf: tarfile.TarFile) -> None: - """Validate TAR member count and total size against decompression bombs. + """Validate TAR members against decompression bombs and path traversal. + + Size/count limits mirror :meth:`_check_zip_members`. Path validation + is defence-in-depth: on Python ≥ 3.11.4 the ``filter="tar"`` passed to + :meth:`tarfile.TarFile.extractall` also rejects unsafe paths, but we + check here too so the guard applies on all supported Python versions. Raises: - RuntimeError: When the archive exceeds the size/count limits. + RuntimeError: When the archive exceeds the size/count limits or + contains an absolute path or ``..`` component. """ members = tf.getmembers() ArchiveLocalRepo._check_archive_limits( len(members), sum(m.size for m in members if m.isfile()) ) + for member in members: + member_path = pathlib.PurePosixPath(member.name) + if member_path.is_absolute() or any( + part == ".." for part in member_path.parts + ): + raise RuntimeError( + f"Archive contains an unsafe member path: {member.name!r}" + ) @staticmethod def _extract_raw(archive_path: str, dest_dir: str) -> None: diff --git a/features/steps/archive_steps.py b/features/steps/archive_steps.py index 2b8d35a0..f1817ed9 100644 --- a/features/steps/archive_steps.py +++ b/features/steps/archive_steps.py @@ -48,9 +48,12 @@ def _archive_url(context, filename: str) -> str: apply_manifest_substitutions produces ``file:///`` + absolute path, which for an absolute path like ``/tmp/...`` yields four slashes (``file:////tmp/...``). We must match that format so placeholder substitution works in SBOM assertions. + + :func:`pathlib.Path.as_posix` is used instead of :func:`str.split`/join so + that mixed separators (e.g. on Windows) are normalised correctly. 
""" - server_fwd = "/".join(context.remotes_dir_path.split(os.sep)) - return f"file:///{server_fwd}/{filename}" + server_posix = pathlib.Path(context.remotes_dir_path).as_posix() + return f"file:///{server_posix}/{filename}" def _create_archive(context, name: str, extension: str) -> None: From a34a3943297494254143b4364dff047eafc770f1 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 20 Mar 2026 16:33:40 +0000 Subject: [PATCH 10/35] Add unit tests for archive VCS, integrity block, and PURL; update docs (#23) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - tests/test_archive.py: unit tests for compute_hash, _safe_compare_hex, is_archive_url, _suffix_for_url, _check_archive_limits, _check_zip_members, _check_tar_members (including path-traversal rejection), ArchiveRemote.is_accessible, and ArchiveLocalRepo.extract (strip top-level dir, src filter, zip) - tests/test_integrity.py: unit tests for Integrity dataclass (__bool__, as_yaml) and ProjectEntry integrity field (from dict, accessor, setter, as_yaml) - tests/test_purl.py: archive URL → PURL attribute-based tests (type, name, namespace, download_url qualifier; no vcs_url) for all supported extensions - dfetch/project/archivesubproject.py: add scenario-include for freeze-archive.feature - dfetch/reporting/sbom_reporter.py: fix stale 'hash:' → 'integrity.hash:' in docstring; add scenario-include for no-hash SBOM scenario https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm --- dfetch/project/archivesubproject.py | 2 + dfetch/reporting/sbom_reporter.py | 8 +- tests/test_archive.py | 326 ++++++++++++++++++++++++++++ tests/test_integrity.py | 73 +++++++ tests/test_purl.py | 63 ++++++ 5 files changed, 470 insertions(+), 2 deletions(-) create mode 100644 tests/test_archive.py create mode 100644 tests/test_integrity.py diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index 34e8a9be..13b7cc06 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -35,6 +35,8 @@ hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 .. scenario-include:: ../features/fetch-archive.feature + +.. scenario-include:: ../features/freeze-archive.feature """ from __future__ import annotations diff --git a/dfetch/reporting/sbom_reporter.py b/dfetch/reporting/sbom_reporter.py index 4892768b..18dd295f 100644 --- a/dfetch/reporting/sbom_reporter.py +++ b/dfetch/reporting/sbom_reporter.py @@ -17,8 +17,12 @@ Archive dependencies -------------------- Archive dependencies (tar.gz, zip, …) are recorded with a ``distribution`` -external reference and, when a ``hash:`` field is set, a ``SHA-256`` component -hash for supply-chain integrity verification. +external reference and, when an ``integrity.hash:`` field is set, a ``SHA-256`` +component hash for supply-chain integrity verification. + +.. scenario-include:: ../features/report-sbom-archive.feature + :scenario: + A fetched archive without a hash generates a json sbom .. 
scenario-include:: ../features/report-sbom-archive.feature :scenario: diff --git a/tests/test_archive.py b/tests/test_archive.py new file mode 100644 index 00000000..05389e77 --- /dev/null +++ b/tests/test_archive.py @@ -0,0 +1,326 @@ +"""Unit tests for dfetch.vcs.archive.""" + +import io +import os +import tarfile +import tempfile +import zipfile + +import pytest + +import dfetch.project # noqa: F401 – must be imported before dfetch.vcs.archive to break circular init +from dfetch.vcs.archive import ( + ARCHIVE_EXTENSIONS, + SUPPORTED_HASH_ALGORITHMS, + ArchiveLocalRepo, + ArchiveRemote, + _safe_compare_hex, + _suffix_for_url, + compute_hash, + is_archive_url, +) + +# These are static methods on ArchiveLocalRepo +_check_archive_limits = ArchiveLocalRepo._check_archive_limits +_check_zip_members = ArchiveLocalRepo._check_zip_members +_check_tar_members = ArchiveLocalRepo._check_tar_members + + +# --------------------------------------------------------------------------- +# compute_hash +# --------------------------------------------------------------------------- + + +def test_compute_hash_empty_file(): + with tempfile.NamedTemporaryFile(delete=False) as f: + path = f.name + try: + digest = compute_hash(path, "sha256") + # SHA-256 of empty string + assert digest == "e3b0c44298fc1c149afbf4c8996fb924" "27ae41e4649b934ca495991b7852b855" + finally: + os.remove(path) + + +def test_compute_hash_known_content(): + with tempfile.NamedTemporaryFile(delete=False) as f: + f.write(b"hello world\n") + path = f.name + try: + digest = compute_hash(path, "sha256") + assert len(digest) == 64 + assert all(c in "0123456789abcdef" for c in digest) + finally: + os.remove(path) + + +def test_compute_hash_unsupported_algorithm(): + with tempfile.NamedTemporaryFile(delete=False) as f: + path = f.name + try: + with pytest.raises(RuntimeError, match="Unsupported hash algorithm"): + compute_hash(path, "md5") + finally: + os.remove(path) + + +def test_compute_hash_default_is_sha256(): + with tempfile.NamedTemporaryFile(delete=False) as f: + f.write(b"data") + path = f.name + try: + digest = compute_hash(path) + assert len(digest) == 64 + finally: + os.remove(path) + + +# --------------------------------------------------------------------------- +# _safe_compare_hex +# --------------------------------------------------------------------------- + + +def test_safe_compare_hex_equal(): + h = "a" * 64 + assert _safe_compare_hex(h, h) is True + + +def test_safe_compare_hex_case_insensitive(): + assert _safe_compare_hex("ABCDEF", "abcdef") is True + + +def test_safe_compare_hex_not_equal(): + assert _safe_compare_hex("a" * 64, "b" * 64) is False + + +# --------------------------------------------------------------------------- +# is_archive_url +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "url", + [ + "https://example.com/lib.tar.gz", + "https://example.com/lib.tgz", + "https://example.com/lib.tar.bz2", + "https://example.com/lib.tar.xz", + "https://example.com/lib.zip", + "file:///tmp/lib.ZIP", # case-insensitive + ], +) +def test_is_archive_url_true(url): + assert is_archive_url(url) is True + + +@pytest.mark.parametrize( + "url", + [ + "https://example.com/repo.git", + "https://example.com/", + "svn://svn.example.com/trunk", + "https://example.com/lib.tar.gz.sig", + ], +) +def test_is_archive_url_false(url): + assert is_archive_url(url) is False + + +# --------------------------------------------------------------------------- +# _suffix_for_url +# 
--------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "url,expected", + [ + ("https://example.com/lib.tar.gz", ".tar.gz"), + ("https://example.com/lib.tgz", ".tgz"), + ("https://example.com/lib.tar.bz2", ".tar.bz2"), + ("https://example.com/lib.tar.xz", ".tar.xz"), + ("https://example.com/lib.zip", ".zip"), + ("https://example.com/lib.unknown", ".archive"), + ], +) +def test_suffix_for_url(url, expected): + assert _suffix_for_url(url) == expected + + +def test_suffix_for_url_prefers_longest_match(): + # .tar.gz should win over .gz + assert _suffix_for_url("https://example.com/lib.tar.gz") == ".tar.gz" + + +# --------------------------------------------------------------------------- +# _check_archive_limits +# --------------------------------------------------------------------------- + + +def test_check_archive_limits_ok(): + _check_archive_limits(member_count=1, total_bytes=1024) # should not raise + + +def test_check_archive_limits_too_many_members(): + with pytest.raises(RuntimeError, match="safety limit"): + _check_archive_limits(member_count=10_001, total_bytes=0) + + +def test_check_archive_limits_too_large(): + with pytest.raises(RuntimeError, match="safety limit"): + _check_archive_limits(member_count=1, total_bytes=500 * 1024 * 1024 + 1) + + +# --------------------------------------------------------------------------- +# _check_zip_members +# --------------------------------------------------------------------------- + + +def _make_zip(member_names: list[str]) -> zipfile.ZipFile: + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + for name in member_names: + zf.writestr(name, "content") + buf.seek(0) + return zipfile.ZipFile(buf) + + +def test_check_zip_members_safe(): + zf = _make_zip(["project/README.md", "project/src/main.c"]) + _check_zip_members(zf) # should not raise + + +def test_check_zip_members_dot_dot(): + zf = _make_zip(["project/../etc/passwd"]) + with pytest.raises(RuntimeError, match="unsafe member path"): + _check_zip_members(zf) + + +def test_check_zip_members_absolute(): + zf = _make_zip(["/etc/passwd"]) + with pytest.raises(RuntimeError, match="unsafe member path"): + _check_zip_members(zf) + + +# --------------------------------------------------------------------------- +# _check_tar_members +# --------------------------------------------------------------------------- + + +def _make_tar(member_names: list[str]) -> tarfile.TarFile: + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:gz") as tf: + for name in member_names: + content = b"content" + info = tarfile.TarInfo(name=name) + info.size = len(content) + tf.addfile(info, io.BytesIO(content)) + buf.seek(0) + return tarfile.open(fileobj=buf, mode="r:gz") + + +def test_check_tar_members_safe(): + tf = _make_tar(["project/README.md", "project/src/main.c"]) + _check_tar_members(tf) # should not raise + + +def test_check_tar_members_dot_dot(): + tf = _make_tar(["project/../etc/passwd"]) + with pytest.raises(RuntimeError, match="unsafe member path"): + _check_tar_members(tf) + + +def test_check_tar_members_absolute(): + tf = _make_tar(["/etc/passwd"]) + with pytest.raises(RuntimeError, match="unsafe member path"): + _check_tar_members(tf) + + +# --------------------------------------------------------------------------- +# ArchiveRemote.is_accessible +# --------------------------------------------------------------------------- + + +def test_is_accessible_existing_file(): + with tempfile.NamedTemporaryFile(suffix=".tar.gz", 
delete=False) as f: + path = f.name + try: + url = f"file:///{path.lstrip('/')}" + remote = ArchiveRemote(url) + assert remote.is_accessible() is True + finally: + os.remove(path) + + +def test_is_accessible_missing_file(): + remote = ArchiveRemote("file:////nonexistent/path/lib.tar.gz") + assert remote.is_accessible() is False + + +# --------------------------------------------------------------------------- +# ArchiveLocalRepo.extract – basic smoke test +# --------------------------------------------------------------------------- + + +def _make_tar_gz_file(archive_path: str, members: dict[str, bytes]) -> None: + with tarfile.open(archive_path, "w:gz") as tf: + for name, content in members.items(): + info = tarfile.TarInfo(name=name) + info.size = len(content) + tf.addfile(info, io.BytesIO(content)) + + +def test_extract_tar_gz_strips_top_level_dir(): + with tempfile.TemporaryDirectory() as tmp: + archive_path = os.path.join(tmp, "lib.tar.gz") + _make_tar_gz_file( + archive_path, + { + "lib-1.0/README.md": b"hello", + "lib-1.0/src/main.c": b"int main(){}", + }, + ) + dest = os.path.join(tmp, "dest") + ArchiveLocalRepo.extract(archive_path, dest) + assert os.path.isfile(os.path.join(dest, "README.md")) + assert os.path.isfile(os.path.join(dest, "src", "main.c")) + + +def test_extract_tar_gz_with_src_filter(): + with tempfile.TemporaryDirectory() as tmp: + archive_path = os.path.join(tmp, "lib.tar.gz") + _make_tar_gz_file( + archive_path, + { + "lib-1.0/README.md": b"readme", + "lib-1.0/src/main.c": b"main", + "lib-1.0/tests/test.c": b"test", + }, + ) + dest = os.path.join(tmp, "dest") + ArchiveLocalRepo.extract(archive_path, dest, src="src") + assert os.path.isfile(os.path.join(dest, "main.c")) + assert not os.path.exists(os.path.join(dest, "tests")) + # License-like files are not present in this archive so no extra files expected + + +def test_extract_zip(): + with tempfile.TemporaryDirectory() as tmp: + archive_path = os.path.join(tmp, "lib.zip") + with zipfile.ZipFile(archive_path, "w") as zf: + zf.writestr("lib-1.0/README.md", "hello") + zf.writestr("lib-1.0/src/main.c", "int main(){}") + dest = os.path.join(tmp, "dest") + ArchiveLocalRepo.extract(archive_path, dest) + assert os.path.isfile(os.path.join(dest, "README.md")) + assert os.path.isfile(os.path.join(dest, "src", "main.c")) + + +def test_all_archive_extensions_covered(): + """Ensure ARCHIVE_EXTENSIONS is a non-empty tuple of dot-prefixed strings.""" + assert len(ARCHIVE_EXTENSIONS) > 0 + for ext in ARCHIVE_EXTENSIONS: + assert ext.startswith(".") + + +def test_supported_hash_algorithms(): + assert "sha256" in SUPPORTED_HASH_ALGORITHMS diff --git a/tests/test_integrity.py b/tests/test_integrity.py new file mode 100644 index 00000000..72398190 --- /dev/null +++ b/tests/test_integrity.py @@ -0,0 +1,73 @@ +"""Unit tests for the Integrity dataclass and ProjectEntry integrity fields.""" + +from dfetch.manifest.project import Integrity, ProjectEntry + + +# --------------------------------------------------------------------------- +# Integrity dataclass +# --------------------------------------------------------------------------- + + +def test_integrity_empty_is_falsy(): + assert not Integrity() + + +def test_integrity_with_hash_is_truthy(): + assert Integrity(hash="sha256:" + "a" * 64) + + +def test_integrity_as_yaml_empty(): + assert Integrity().as_yaml() == {} + + +def test_integrity_as_yaml_with_hash(): + h = "sha256:" + "a" * 64 + assert Integrity(hash=h).as_yaml() == {"hash": h} + + +# 
--------------------------------------------------------------------------- +# ProjectEntry with integrity block +# --------------------------------------------------------------------------- + + +def test_projectentry_hash_from_integrity_block(): + h = "sha256:" + "b" * 64 + project = ProjectEntry({"name": "lib", "integrity": {"hash": h}}) + assert project.hash == h + + +def test_projectentry_hash_empty_by_default(): + project = ProjectEntry({"name": "lib"}) + assert project.hash == "" + + +def test_projectentry_integrity_truthy_with_hash(): + h = "sha256:" + "c" * 64 + project = ProjectEntry({"name": "lib", "integrity": {"hash": h}}) + assert project.integrity + + +def test_projectentry_integrity_falsy_without_hash(): + project = ProjectEntry({"name": "lib", "integrity": {}}) + assert not project.integrity + + +def test_projectentry_as_yaml_includes_integrity(): + h = "sha256:" + "d" * 64 + project = ProjectEntry({"name": "lib", "url": "https://example.com/lib.tar.gz", "vcs": "archive", "integrity": {"hash": h}}) + yaml_data = project.as_yaml() + assert yaml_data["integrity"] == {"hash": h} + + +def test_projectentry_as_yaml_omits_empty_integrity(): + project = ProjectEntry({"name": "lib"}) + yaml_data = project.as_yaml() + assert "integrity" not in yaml_data + + +def test_projectentry_hash_setter(): + project = ProjectEntry({"name": "lib", "url": "https://example.com/lib.tar.gz", "vcs": "archive"}) + h = "sha256:" + "e" * 64 + project.hash = h + assert project.hash == h + assert project.integrity.hash == h diff --git a/tests/test_purl.py b/tests/test_purl.py index a96aa02c..5c417329 100644 --- a/tests/test_purl.py +++ b/tests/test_purl.py @@ -122,3 +122,66 @@ def test_remote_url_to_purl(url, expected): assert purl is None else: assert str(purl) == expected + + +# --------------------------------------------------------------------------- +# Archive URL → PURL (attribute-based to avoid percent-encoding sensitivity) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "url,expected_name,expected_namespace,expected_download_url", + [ + ( + "https://example.com/releases/mylib-1.0.tar.gz", + "mylib-1.0", + "example.com", + "https://example.com/releases/mylib-1.0.tar.gz", + ), + ( + "https://example.com/lib.zip", + "lib", + "example.com", + "https://example.com/lib.zip", + ), + ( + "https://releases.example.com/project-2.1.tar.bz2", + "project-2.1", + "releases.example.com", + "https://releases.example.com/project-2.1.tar.bz2", + ), + ( + "https://example.com/lib.tgz", + "lib", + "example.com", + "https://example.com/lib.tgz", + ), + ( + "https://example.com/lib.tar.xz", + "lib", + "example.com", + "https://example.com/lib.tar.xz", + ), + ( + "file:///tmp/local-archive.tar.gz", + "local-archive", + "", # no hostname for file:// URLs + "file:///tmp/local-archive.tar.gz", + ), + ], +) +def test_archive_url_to_purl_attributes( + url, expected_name, expected_namespace, expected_download_url +): + purl = remote_url_to_purl(url) + assert purl.type == "generic" + assert purl.name == expected_name + assert (purl.namespace or "") == expected_namespace + assert purl.qualifiers.get("download_url") == expected_download_url + assert "vcs_url" not in (purl.qualifiers or {}) + + +def test_archive_purl_with_version(): + url = "https://example.com/lib-1.0.tar.gz" + purl = remote_url_to_purl(url, version="sha256:" + "a" * 64) + assert purl.version == "sha256:" + "a" * 64 From 1b2d30af669cc2879c797605769fd7ecb9774082 Mon Sep 17 00:00:00 2001 From: Ben 
Date: Fri, 20 Mar 2026 19:55:17 +0000 Subject: [PATCH 11/35] Fixes --- .github/workflows/release.yml | 2 +- dfetch/manifest/manifest.py | 16 ++-- dfetch/manifest/project.py | 6 +- dfetch/project/__init__.py | 4 +- dfetch/project/archivesubproject.py | 11 ++- dfetch/project/subproject.py | 4 +- dfetch/reporting/sbom_reporter.py | 87 +++++++++++--------- dfetch/util/purl.py | 68 +++++++++------- dfetch/vcs/archive.py | 118 +++++++++++++++++++--------- tests/test_archive.py | 7 +- tests/test_integrity.py | 14 +++- 11 files changed, 215 insertions(+), 122 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index dd9fc3fd..e1874f98 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -50,7 +50,7 @@ jobs: - name: Update latest tag if: ${{ steps.release_info.outputs.tag == 'latest' }} - uses: EndBug/latest-tag@fabb56bc8d15d5937c76719060da2226f5c3ffa8 + uses: EndBug/latest-tag@fabb56bc8d15d5937c76719060da2226f5c3ffa8 with: ref: latest description: Last state in main diff --git a/dfetch/manifest/manifest.py b/dfetch/manifest/manifest.py index 0f30e983..ec084c28 100644 --- a/dfetch/manifest/manifest.py +++ b/dfetch/manifest/manifest.py @@ -97,7 +97,9 @@ class ManifestDict(TypedDict, total=True): # pylint: disable=too-many-ancestors version: int | str remotes: NotRequired[Sequence[RemoteDict | Remote]] - projects: Sequence[ProjectEntryDict | ProjectEntry | dict[str, str | list[str]]] + projects: Sequence[ + ProjectEntryDict | ProjectEntry | dict[str, str | list[str] | dict[str, str]] + ] class Manifest: @@ -140,14 +142,16 @@ def __init__( def _init_projects( self, projects: Sequence[ - ProjectEntryDict | ProjectEntry | dict[str, str | list[str]] + ProjectEntryDict + | ProjectEntry + | dict[str, str | list[str] | dict[str, str]] ], ) -> dict[str, ProjectEntry]: """Iterate over projects from manifest and initialize ProjectEntries from it. 
Args: projects (Sequence[ - Union[ProjectEntryDict, ProjectEntry, Dict[str, Union[str, list[str]]]] + Union[ProjectEntryDict, ProjectEntry, Dict[str, Union[str, list[str], dict[str, str]]]] ]): Iterable with projects Raises: @@ -304,9 +308,11 @@ def _as_dict(self) -> dict[str, ManifestDict]: if len(remotes) == 1: remotes[0].pop("default", None) - projects: list[dict[str, str | list[str]]] = [] + projects: list[dict[str, str | list[str] | dict[str, str]]] = [] for project in self.projects: - project_yaml: dict[str, str | list[str]] = project.as_yaml() + project_yaml: dict[str, str | list[str] | dict[str, str]] = ( + project.as_yaml() + ) if len(remotes) == 1: project_yaml.pop("remote", None) projects.append(project_yaml) diff --git a/dfetch/manifest/project.py b/dfetch/manifest/project.py index d5c8dd36..10fe936d 100644 --- a/dfetch/manifest/project.py +++ b/dfetch/manifest/project.py @@ -370,7 +370,7 @@ def as_yaml(self) -> dict[str, str]: "repo-path": str, "vcs": str, "ignore": Sequence[str], - "integrity": dict, + "integrity": dict[str, str], "default_remote": str, }, total=False, @@ -398,7 +398,7 @@ def __init__(self, kwargs: ProjectEntryDict) -> None: self._tag: str = kwargs.get("tag", "") self._vcs: str = kwargs.get("vcs", "") self._ignore: Sequence[str] = kwargs.get("ignore", []) - integrity_data: dict = kwargs.get("integrity", {}) + integrity_data: dict[str, str] = kwargs.get("integrity", {}) self._integrity = Integrity(hash=integrity_data.get("hash", "")) if not self._remote and not self._url: @@ -407,7 +407,7 @@ def __init__(self, kwargs: ProjectEntryDict) -> None: @classmethod def from_yaml( cls, - yamldata: dict[str, str | list[str]] | ProjectEntryDict, + yamldata: dict[str, str | list[str] | dict[str, str]] | ProjectEntryDict, default_remote: str = "", ) -> "ProjectEntry": """Create a Project Entry from yaml data. 
diff --git a/dfetch/project/__init__.py b/dfetch/project/__init__.py index 96191220..b6bb3ecd 100644 --- a/dfetch/project/__init__.py +++ b/dfetch/project/__init__.py @@ -15,7 +15,9 @@ from dfetch.project.svnsuperproject import SvnSuperProject from dfetch.util.util import resolve_absolute_path -SUPPORTED_SUBPROJECT_TYPES = [ArchiveSubProject, GitSubProject, SvnSubProject] +SUPPORTED_SUBPROJECT_TYPES: list[ + type[ArchiveSubProject] | type[GitSubProject] | type[SvnSubProject] +] = [ArchiveSubProject, GitSubProject, SvnSubProject] SUPPORTED_SUPERPROJECT_TYPES = [GitSuperProject, SvnSuperProject] logger = get_logger(__name__) diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index 13b7cc06..a2cad4e0 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -44,17 +44,22 @@ import os import pathlib import tempfile +import urllib.request as _ur from dfetch.log import get_logger from dfetch.manifest.project import ProjectEntry from dfetch.manifest.version import Version from dfetch.project.subproject import SubProject +from dfetch.vcs.archive import ( + _safe_compare_hex, # private helper, intentionally imported for internal use +) +from dfetch.vcs.archive import ( + _suffix_for_url, # private helper, intentionally imported for internal use +) from dfetch.vcs.archive import ( SUPPORTED_HASH_ALGORITHMS, ArchiveLocalRepo, ArchiveRemote, - _safe_compare_hex, # private helper, intentionally imported for internal use - _suffix_for_url, # private helper, intentionally imported for internal use compute_hash, is_archive_url, ) @@ -94,8 +99,6 @@ def revision_is_enough() -> bool: @staticmethod def list_tool_info() -> None: """Log information about the archive fetching tool (Python's urllib).""" - import urllib.request as _ur # noqa: PLC0415 - SubProject._log_tool("urllib", _ur.__doc__ or "built-in") def get_default_branch(self) -> str: diff --git a/dfetch/project/subproject.py b/dfetch/project/subproject.py index 0812d0bb..09a1299f 100644 --- a/dfetch/project/subproject.py +++ b/dfetch/project/subproject.py @@ -409,7 +409,9 @@ def freeze_project(self, project: ProjectEntry) -> str | None: return None if on_disk_version: project.version = on_disk_version - return on_disk_version.revision or on_disk_version.tag or str(on_disk_version) + return ( + on_disk_version.revision or on_disk_version.tag or str(on_disk_version) + ) return None @staticmethod diff --git a/dfetch/reporting/sbom_reporter.py b/dfetch/reporting/sbom_reporter.py index 18dd295f..931d4346 100644 --- a/dfetch/reporting/sbom_reporter.py +++ b/dfetch/reporting/sbom_reporter.py @@ -107,13 +107,14 @@ from cyclonedx.model.license import LicenseAcknowledgement from cyclonedx.output import make_outputter from cyclonedx.schema import OutputFormat, SchemaVersion +from packageurl import PackageURL import dfetch.util.purl -from dfetch.util.purl import DFETCH_TO_CDX_HASH_ALGORITHM from dfetch.manifest.manifest import Manifest from dfetch.manifest.project import ProjectEntry from dfetch.reporting.reporter import Reporter from dfetch.util.license import License +from dfetch.util.purl import DFETCH_TO_CDX_HASH_ALGORITHM # PyRight is pedantic with decorators see https://github.com/madpah/serializable/issues/8 # It might be fixable with https://github.com/microsoft/pyright/discussions/4426, would prefer @@ -190,11 +191,8 @@ def add_project( purl = dfetch.util.purl.remote_url_to_purl( project.remote_url, version=version, subpath=project.source or None ) - name = project.name if purl.type 
== "generic" else purl.name - location = self.manifest.find_name_in_manifest(project.name) - component = Component( name=name, version=version, @@ -250,7 +248,15 @@ def add_project( ], ), ) + self._apply_external_references(component, purl, version) + self._apply_licenses(component, licenses) + self._bom.components.add(component) + @staticmethod + def _apply_external_references( + component: Component, purl: PackageURL, version: str + ) -> None: + """Attach external references to *component* based on its PURL type.""" if purl.type == "github": component.external_references.add( ExternalReference( @@ -266,53 +272,62 @@ def add_project( ) ) elif purl.qualifiers.get("download_url"): - # Archive dependency: add a DISTRIBUTION external reference and, - # when the version encodes a cryptographic hash, record it on the component. - download_url = purl.qualifiers["download_url"] - component.group = purl.namespace or None # type: ignore[assignment] + SbomReporter._apply_archive_refs(component, purl, version) + else: + SbomReporter._apply_vcs_refs(component, purl) + + @staticmethod + def _apply_archive_refs( + component: Component, purl: PackageURL, version: str + ) -> None: + """Add DISTRIBUTION reference and optional hash for an archive dependency.""" + download_url = purl.qualifiers["download_url"] + component.group = purl.namespace or None # type: ignore[assignment] + component.external_references.add( + ExternalReference( + type=ExternalReferenceType.DISTRIBUTION, + url=XsUri(download_url), + ) + ) + if version and ":" in version: + algo_prefix, hex_value = version.split(":", 1) + cdx_algo_name = DFETCH_TO_CDX_HASH_ALGORITHM.get(algo_prefix) + if cdx_algo_name: + component.hashes.add( + HashType( + alg=HashAlgorithm(cdx_algo_name), + content=hex_value, + ) + ) + + @staticmethod + def _apply_vcs_refs(component: Component, purl: PackageURL) -> None: + """Add VCS external reference and group for a generic VCS dependency.""" + component.group = purl.namespace + vcs_url = purl.qualifiers.get("vcs_url", "") + # ExternalReferenceType.VCS does not support ssh:// urls + if vcs_url and "ssh://" not in vcs_url: component.external_references.add( ExternalReference( - type=ExternalReferenceType.DISTRIBUTION, - url=XsUri(download_url), + type=ExternalReferenceType.VCS, + url=XsUri(vcs_url), ) ) - if version and ":" in version: - algo_prefix, hex_value = version.split(":", 1) - cdx_algo_name = DFETCH_TO_CDX_HASH_ALGORITHM.get(algo_prefix) - if cdx_algo_name: - component.hashes.add( - HashType( - alg=HashAlgorithm(cdx_algo_name), - content=hex_value, - ) - ) - else: - component.group = purl.namespace - - vcs_url = purl.qualifiers.get("vcs_url", "") - # ExternalReferenceType.VCS does not support ssh:// urls - if vcs_url and "ssh://" not in vcs_url: - component.external_references.add( - ExternalReference( - type=ExternalReferenceType.VCS, - url=XsUri(vcs_url), - ) - ) + @staticmethod + def _apply_licenses(component: Component, licenses: list[License]) -> None: + """Attach *licenses* to *component* and its evidence block.""" for lic in licenses: - # License wants either an SPDX id or a name, prefer SPDX id when available + # Prefer SPDX id when available cdx_license = ( CycloneDxLicense(id=lic.spdx_id) if lic.spdx_id else CycloneDxLicense(name=lic.name) ) - component.licenses.add(cdx_license) if component.evidence: component.evidence.licenses.add(cdx_license) - self._bom.components.add(component) - def dump_to_file(self, outfile: str) -> bool: """Dump the SBoM to file.""" output_format = OutputFormat( diff --git 
a/dfetch/util/purl.py b/dfetch/util/purl.py index deba1272..f2b528c7 100644 --- a/dfetch/util/purl.py +++ b/dfetch/util/purl.py @@ -101,59 +101,67 @@ def _known_purl_types( return None -def remote_url_to_purl( - remote_url: str, version: str | None = None, subpath: str | None = None +def _archive_purl( + remote_url: str, version: str | None, subpath: str | None ) -> PackageURL: - """Convert a remote URL to a valid PackageURL object. + """Build a generic PURL for an archive URL.""" + parsed = urlparse(remote_url) + basename = os.path.basename(parsed.path) + name = _strip_archive_extension(basename) or DEFAULT_NAME + namespace = parsed.hostname or "" + return PackageURL( + type="generic", + namespace=namespace or None, + name=name, + version=version, + qualifiers={"download_url": remote_url}, + subpath=subpath, + ) - Supports GitHub, Bitbucket, SVN, SSH paths, and archive downloads. - Optionally specify version and subpath. - """ - purl = _known_purl_types(remote_url, version, subpath) - if purl: - return purl - # Archive URLs (tar.gz, zip, …) get a generic PURL with a download_url qualifier. - # The name is derived from the archive filename (extension stripped); the - # namespace is the hostname (empty for file:// URLs). - if _is_archive_url(remote_url): - parsed = urlparse(remote_url) - basename = os.path.basename(parsed.path) - name = _strip_archive_extension(basename) or DEFAULT_NAME - namespace = parsed.hostname or "" - return PackageURL( - type="generic", - namespace=namespace or None, - name=name, - version=version, - qualifiers={"download_url": remote_url}, - subpath=subpath, - ) +def _vcs_namespace_and_name(remote_url: str) -> tuple[str, str, str]: + """Derive namespace, name, and normalised URL for a generic VCS remote URL. + Returns: + A ``(namespace, name, remote_url)`` tuple where *remote_url* may have + been normalised (e.g. SSH short-form converted to ``ssh://`` scheme). + """ parsed = urlparse(remote_url) path = parsed.path.lstrip("/") - if "svn" in parsed.scheme or "svn." in parsed.netloc: namespace, name = _namespace_and_name_from_domain_and_path(parsed.netloc, path) if namespace.startswith("p/"): namespace = namespace[len("p/") :] namespace = namespace.replace("/svn/", "/") - else: match = SSH_REGEX.match(remote_url) if match: namespace, name = _namespace_and_name_from_domain_and_path( - match.group("host"), - match.group("path"), + match.group("host"), match.group("path") ) - if not parsed.scheme: remote_url = f"ssh://{parsed.path.replace(':', '/')}" else: namespace, name = _namespace_and_name_from_domain_and_path( remote_url, path.replace(".git", "") ) + return namespace, name, remote_url + +def remote_url_to_purl( + remote_url: str, version: str | None = None, subpath: str | None = None +) -> PackageURL: + """Convert a remote URL to a valid PackageURL object. + + Supports GitHub, Bitbucket, SVN, SSH paths, and archive downloads. + Optionally specify version and subpath. 
+ """ + purl = _known_purl_types(remote_url, version, subpath) + if purl: + return purl + if _is_archive_url(remote_url): + return _archive_purl(remote_url, version, subpath) + namespace, name, remote_url = _vcs_namespace_and_name(remote_url) return PackageURL( type="generic", namespace=namespace, diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index f3d1677c..cc011d4b 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -24,30 +24,27 @@ import hashlib import hmac +import http.client import os import pathlib import shutil import sys import tarfile import tempfile -import urllib.error -import urllib.request +import urllib.parse import zipfile from collections.abc import Sequence +from dfetch.log import get_logger +from dfetch.project.subproject import SubProject +from dfetch.util.util import find_matching_files, safe_rm + #: Archive file extensions recognised by DFetch. -#: Defined before any intra-package imports to avoid partial-initialisation -#: issues when other modules (e.g. dfetch.util.purl) import this symbol while -#: the module is still being initialised. ARCHIVE_EXTENSIONS = (".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".zip") #: Hash algorithms supported by the ``integrity.hash`` manifest field. SUPPORTED_HASH_ALGORITHMS = ("sha256",) -from dfetch.log import get_logger # noqa: E402 -from dfetch.project.subproject import SubProject # noqa: E402 -from dfetch.util.util import find_matching_files, safe_rm # noqa: E402 - logger = get_logger(__name__) # Safety limits applied during extraction to prevent decompression bombs. @@ -55,6 +52,19 @@ _MAX_MEMBER_COUNT = 10_000 +def _http_conn(scheme: str, netloc: str, timeout: int) -> http.client.HTTPConnection: + """Return an :class:`http.client.HTTPConnection` or HTTPS variant for *netloc*.""" + if scheme == "https": + return http.client.HTTPSConnection(netloc, timeout=timeout) + return http.client.HTTPConnection(netloc, timeout=timeout) + + +def _resource_path(parsed: urllib.parse.ParseResult) -> str: + """Return the path + query portion of *parsed* suitable for HTTP requests.""" + path = parsed.path or "/" + return f"{path}?{parsed.query}" if parsed.query else path + + def is_archive_url(url: str) -> bool: """Return *True* when *url* ends with a recognised archive extension.""" return any(url.lower().endswith(ext) for ext in ARCHIVE_EXTENSIONS) @@ -112,26 +122,29 @@ def is_accessible(self) -> bool: * ``http``/``https`` URLs first try a ``HEAD`` request. If the server rejects it (405/501) a partial ``GET`` (``Range: bytes=0-0``) is attempted instead. Returns *False* on any final failure. + * Any other URL scheme returns *False*. 
""" - from urllib.parse import urlparse as _urlparse # noqa: PLC0415 - - if _urlparse(self.url).scheme == "file": - path = _urlparse(self.url).path - return os.path.exists(path) - - for method, headers in [ - ("HEAD", {}), - ("GET", {"Range": "bytes=0-0"}), - ]: + parsed = urllib.parse.urlparse(self.url) + if parsed.scheme == "file": + return os.path.exists(parsed.path) + if parsed.scheme not in ("http", "https"): + return False + return self._is_http_reachable(parsed) + + def _is_http_reachable(self, parsed: urllib.parse.ParseResult) -> bool: + """Try HEAD then partial-GET to confirm an HTTP/HTTPS URL is reachable.""" + netloc, path = parsed.netloc, _resource_path(parsed) + for method, headers in [("HEAD", {}), ("GET", {"Range": "bytes=0-0"})]: try: - req = urllib.request.Request(self.url, method=method, headers=headers) - with urllib.request.urlopen(req, timeout=15): - return True - except urllib.error.HTTPError as exc: - if exc.code in (405, 501): # Method Not Allowed / Not Implemented - continue - return False - except (urllib.error.URLError, OSError, ValueError): + conn = _http_conn(parsed.scheme, netloc, timeout=15) + try: + conn.request(method, path, headers=headers) + status = conn.getresponse().status + if status not in (405, 501): + return status < 400 + finally: + conn.close() + except (OSError, ValueError, http.client.HTTPException): return False return False @@ -142,14 +155,40 @@ def download(self, dest_path: str) -> None: dest_path: Local file path to write the archive to. Raises: - RuntimeError: On download failure. + RuntimeError: On download failure or unsupported URL scheme. """ + parsed = urllib.parse.urlparse(self.url) + if parsed.scheme == "file": + try: + shutil.copy(parsed.path, dest_path) + except OSError as exc: + raise RuntimeError( + f"'{self.url}' is not a valid URL or unreachable: {exc}" + ) from exc + elif parsed.scheme in ("http", "https"): + self._http_download(parsed, dest_path) + else: + raise RuntimeError( + f"'{self.url}' uses unsupported scheme '{parsed.scheme}'." + ) + + def _http_download(self, parsed: urllib.parse.ParseResult, dest_path: str) -> None: + """Download an HTTP/HTTPS resource to *dest_path*.""" + conn = _http_conn(parsed.scheme, parsed.netloc, timeout=60) try: - urllib.request.urlretrieve(self.url, dest_path) - except (urllib.error.URLError, OSError) as exc: + conn.request("GET", _resource_path(parsed)) + resp = conn.getresponse() + if resp.status != 200: + raise RuntimeError(f"HTTP {resp.status} when downloading '{self.url}'") + with open(dest_path, "wb") as fh: + while chunk := resp.read(65536): + fh.write(chunk) + except (OSError, http.client.HTTPException) as exc: raise RuntimeError( f"'{self.url}' is not a valid URL or unreachable: {exc}" ) from exc + finally: + conn.close() class ArchiveLocalRepo: @@ -223,9 +262,13 @@ def _check_archive_limits(member_count: int, total_bytes: int) -> None: ) @staticmethod - def _check_zip_members(zf: zipfile.ZipFile) -> None: + def check_zip_members(zf: zipfile.ZipFile) -> list[zipfile.ZipInfo]: """Validate all ZIP member paths against path-traversal attacks. + Returns: + The validated list of members, safe to pass to + :meth:`zipfile.ZipFile.extract`. + Raises: RuntimeError: When any member contains an absolute path, a ``..`` component, or when the archive exceeds the size/count limits. 
@@ -242,12 +285,13 @@ def _check_zip_members(zf: zipfile.ZipFile) -> None: raise RuntimeError( f"Archive contains an unsafe member path: {info.filename!r}" ) + return members @staticmethod def _check_tar_members(tf: tarfile.TarFile) -> None: """Validate TAR members against decompression bombs and path traversal. - Size/count limits mirror :meth:`_check_zip_members`. Path validation + Size/count limits mirror :meth:`check_zip_members`. Path validation is defence-in-depth: on Python ≥ 3.11.4 the ``filter="tar"`` passed to :meth:`tarfile.TarFile.extractall` also rejects unsafe paths, but we check here too so the guard applies on all supported Python versions. @@ -289,13 +333,13 @@ def _extract_raw(archive_path: str, dest_dir: str) -> None: with tarfile.open(archive_path, "r:*") as tf: ArchiveLocalRepo._check_tar_members(tf) if sys.version_info >= (3, 11, 4): - tf.extractall(dest_dir, filter="tar") + tf.extractall(dest_dir, filter="tar") # nosec B202 else: - tf.extractall(dest_dir) # noqa: S202 + tf.extractall(dest_dir) # nosec B202 elif lower.endswith(".zip") or zipfile.is_zipfile(archive_path): with zipfile.ZipFile(archive_path) as zf: - ArchiveLocalRepo._check_zip_members(zf) - zf.extractall(dest_dir) + ArchiveLocalRepo.check_zip_members(zf) + zf.extractall(dest_dir) # nosec B202 else: raise RuntimeError( f"Unsupported archive format: '{archive_path}'. " @@ -319,6 +363,8 @@ def _copy_with_src( shutil.copy2(s, d) elif os.path.isfile(src_path): shutil.copy2(src_path, os.path.join(dest_dir, os.path.basename(src_path))) + else: + raise RuntimeError(f"src {src!r} was not found in archive") if keep_licenses: for item in os.listdir(extract_root): diff --git a/tests/test_archive.py b/tests/test_archive.py index 05389e77..0b161b91 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -22,7 +22,7 @@ # These are static methods on ArchiveLocalRepo _check_archive_limits = ArchiveLocalRepo._check_archive_limits -_check_zip_members = ArchiveLocalRepo._check_zip_members +_check_zip_members = ArchiveLocalRepo.check_zip_members _check_tar_members = ArchiveLocalRepo._check_tar_members @@ -37,7 +37,10 @@ def test_compute_hash_empty_file(): try: digest = compute_hash(path, "sha256") # SHA-256 of empty string - assert digest == "e3b0c44298fc1c149afbf4c8996fb924" "27ae41e4649b934ca495991b7852b855" + assert ( + digest == "e3b0c44298fc1c149afbf4c8996fb924" + "27ae41e4649b934ca495991b7852b855" + ) finally: os.remove(path) diff --git a/tests/test_integrity.py b/tests/test_integrity.py index 72398190..0518a094 100644 --- a/tests/test_integrity.py +++ b/tests/test_integrity.py @@ -2,7 +2,6 @@ from dfetch.manifest.project import Integrity, ProjectEntry - # --------------------------------------------------------------------------- # Integrity dataclass # --------------------------------------------------------------------------- @@ -54,7 +53,14 @@ def test_projectentry_integrity_falsy_without_hash(): def test_projectentry_as_yaml_includes_integrity(): h = "sha256:" + "d" * 64 - project = ProjectEntry({"name": "lib", "url": "https://example.com/lib.tar.gz", "vcs": "archive", "integrity": {"hash": h}}) + project = ProjectEntry( + { + "name": "lib", + "url": "https://example.com/lib.tar.gz", + "vcs": "archive", + "integrity": {"hash": h}, + } + ) yaml_data = project.as_yaml() assert yaml_data["integrity"] == {"hash": h} @@ -66,7 +72,9 @@ def test_projectentry_as_yaml_omits_empty_integrity(): def test_projectentry_hash_setter(): - project = ProjectEntry({"name": "lib", "url": "https://example.com/lib.tar.gz", 
"vcs": "archive"}) + project = ProjectEntry( + {"name": "lib", "url": "https://example.com/lib.tar.gz", "vcs": "archive"} + ) h = "sha256:" + "e" * 64 project.hash = h assert project.hash == h From 72aacb7dec86782721bd852ef856c9adbe30c05a Mon Sep 17 00:00:00 2001 From: Ben Date: Fri, 20 Mar 2026 22:41:32 +0000 Subject: [PATCH 12/35] Cleanup implementation --- dfetch/commands/report.py | 4 +- dfetch/manifest/project.py | 4 +- dfetch/project/archivesubproject.py | 91 +++++++++++++---------------- dfetch/project/gitsubproject.py | 6 +- dfetch/project/subproject.py | 10 ---- dfetch/project/svnsubproject.py | 7 +-- dfetch/util/util.py | 66 +++++++++++++++++++++ dfetch/vcs/archive.py | 90 +++++----------------------- features/steps/generic_steps.py | 2 +- tests/test_archive.py | 8 +-- tests/test_patch.py | 2 +- 11 files changed, 135 insertions(+), 155 deletions(-) diff --git a/dfetch/commands/report.py b/dfetch/commands/report.py index 2b20b4f5..261f89c3 100644 --- a/dfetch/commands/report.py +++ b/dfetch/commands/report.py @@ -14,9 +14,9 @@ from dfetch.manifest.project import ProjectEntry from dfetch.project import create_super_project from dfetch.project.metadata import Metadata -from dfetch.project.subproject import SubProject from dfetch.reporting import REPORTERS, ReportTypes from dfetch.util.license import License, guess_license_in_file +from dfetch.util.util import is_license_file logger = get_logger(__name__) @@ -89,7 +89,7 @@ def _determine_licenses(project: ProjectEntry) -> list[License]: license_files = [] with dfetch.util.util.in_directory(project.destination): - for license_file in filter(SubProject.is_license_file, glob.glob("*")): + for license_file in filter(is_license_file, glob.glob("*")): logger.debug(f"Found license file {license_file} for {project.name}") guessed_license = guess_license_in_file(license_file) diff --git a/dfetch/manifest/project.py b/dfetch/manifest/project.py index 10fe936d..ed5e8202 100644 --- a/dfetch/manifest/project.py +++ b/dfetch/manifest/project.py @@ -336,8 +336,8 @@ class Integrity: Holds the ``hash:`` sub-field today and is designed to accommodate future signature-verification fields: - * ``sig`` – URL of a detached signature file (``.sig`` / ``.asc``). - * ``sig_key`` – URL or fingerprint of the signing key (``.p7s`` / ``.gpg``). + * ``sig`` - URL of a detached signature file (``.sig`` / ``.asc``). + * ``sig_key`` - URL or fingerprint of the signing key (``.p7s`` / ``.gpg``). """ hash: str = field(default="") diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index a2cad4e0..2f928c91 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -2,15 +2,15 @@ Archives are a third VCS type alongside ``git`` and ``svn``. They represent versioned dependencies that are distributed as ``.tar.gz``, ``.tgz``, -``.tar.bz2``, ``.tar.xz`` or ``.zip`` files reachable via any URL that Python's -:mod:`urllib.request` understands (``http://``, ``https://``, ``file://``, …). +``.tar.bz2``, ``.tar.xz`` or ``.zip`` files reachable via ``http://``, +``https://``, or ``file://`` URLs. Unlike git and SVN, archives have no inherent "branching" or "tagging" concept. Version identity is expressed through: -* **No hash** – the URL itself acts as the identity. The archive is +* **No hash** - the URL itself acts as the identity. The archive is considered up-to-date as long as the same URL is still reachable. 
-* **``integrity.hash: :``** – the cryptographic hash of the +* **``integrity.hash: :``** - the cryptographic hash of the archive file acts as the version identifier. The fetch step verifies the downloaded archive against this hash and raises an error on mismatch. @@ -41,22 +41,18 @@ from __future__ import annotations +import hmac +import http.client import os import pathlib import tempfile -import urllib.request as _ur from dfetch.log import get_logger from dfetch.manifest.project import ProjectEntry from dfetch.manifest.version import Version from dfetch.project.subproject import SubProject from dfetch.vcs.archive import ( - _safe_compare_hex, # private helper, intentionally imported for internal use -) -from dfetch.vcs.archive import ( - _suffix_for_url, # private helper, intentionally imported for internal use -) -from dfetch.vcs.archive import ( + ARCHIVE_EXTENSIONS, SUPPORTED_HASH_ALGORITHMS, ArchiveLocalRepo, ArchiveRemote, @@ -67,6 +63,24 @@ logger = get_logger(__name__) +def _safe_compare_hex(actual: str, expected: str) -> bool: + """Constant-time comparison of two hex digest strings. + + Uses :func:`hmac.compare_digest` to avoid leaking timing information about + the expected hash value. + """ + return hmac.compare_digest(actual.lower(), expected.lower()) + + +def _suffix_for_url(url: str) -> str: + """Return the archive file suffix for *url* (e.g. ``'.tar.gz'``, ``'.zip'``).""" + lower = url.lower() + for ext in sorted(ARCHIVE_EXTENSIONS, key=len, reverse=True): + if lower.endswith(ext): + return ext + return ".archive" + + class ArchiveSubProject(SubProject): """A project fetched from a tar/zip archive URL. @@ -83,10 +97,6 @@ def __init__(self, project: ProjectEntry) -> None: self._project_entry = project self._remote_repo = ArchiveRemote(project.remote_url) - # ------------------------------------------------------------------ - # SubProject abstract interface - # ------------------------------------------------------------------ - def check(self) -> bool: """Return *True* when the project URL looks like an archive.""" return is_archive_url(self.remote) @@ -98,8 +108,8 @@ def revision_is_enough() -> bool: @staticmethod def list_tool_info() -> None: - """Log information about the archive fetching tool (Python's urllib).""" - SubProject._log_tool("urllib", _ur.__doc__ or "built-in") + """Log information about the archive fetching tool (Python's http.client).""" + SubProject._log_tool("http.client", http.client.__doc__ or "built-in") def get_default_branch(self) -> str: """Archives have no branches; return an empty string.""" @@ -107,7 +117,7 @@ def get_default_branch(self) -> str: def _latest_revision_on_branch(self, branch: str) -> str: # noqa: ARG002 """For archives the 'latest revision' is always the URL (or hash).""" - return self._project_entry.remote_url + return self.remote def _download_and_compute_hash(self, algorithm: str = "sha256") -> str: """Download the archive to a temporary file and return its hash. @@ -117,20 +127,16 @@ def _download_and_compute_hash(self, algorithm: str = "sha256") -> str: Raises: RuntimeError: On download failure or unsupported algorithm. 
""" - tmp_path: str | None = None + fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(self.remote)) + os.close(fd) try: - with tempfile.NamedTemporaryFile( - suffix=_suffix_for_url(self._project_entry.remote_url), delete=False - ) as tmp: - tmp_path = tmp.name self._remote_repo.download(tmp_path) return compute_hash(tmp_path, algorithm) finally: - if tmp_path: - try: - os.remove(tmp_path) - except OSError: - pass + try: + os.remove(tmp_path) + except OSError: + pass def _does_revision_exist(self, revision: str) -> bool: """Check whether *revision* (a hash or URL string) is still valid. @@ -151,17 +157,13 @@ def _does_revision_exist(self, revision: str) -> bool: except RuntimeError: return False - # revision is the URL – just check accessibility + # revision is the URL - just check accessibility return self._remote_repo.is_accessible() def _list_of_tags(self) -> list[str]: """Archives have no tags; returns an empty list.""" return [] - # ------------------------------------------------------------------ - # Version overrides - # ------------------------------------------------------------------ - @property def wanted_version(self) -> Version: """Version derived from the ``integrity.hash`` field or the archive URL. @@ -174,11 +176,7 @@ def wanted_version(self) -> Version: """ if self._project_entry.hash: return Version(revision=self._project_entry.hash) - return Version(revision=self._project_entry.remote_url) - - # ------------------------------------------------------------------ - # Fetch - # ------------------------------------------------------------------ + return Version(revision=self.remote) def _fetch_impl(self, version: Version) -> Version: """Download and extract the archive to the local destination. @@ -193,15 +191,12 @@ def _fetch_impl(self, version: Version) -> Version: Returns: The version that was actually fetched (hash string or URL). """ - url = self._project_entry.remote_url expected_hash = self._project_entry.hash pathlib.Path(self.local_path).mkdir(parents=True, exist_ok=True) - suffix = _suffix_for_url(url) - with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp: - tmp_path = tmp.name - + fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(self.remote)) + os.close(fd) try: self._remote_repo.download(tmp_path) @@ -231,13 +226,7 @@ def _fetch_impl(self, version: Version) -> Version: except OSError: pass - if expected_hash: - return Version(revision=expected_hash) - return Version(revision=url) - - # ------------------------------------------------------------------ - # Freeze support - # ------------------------------------------------------------------ + return Version(revision=expected_hash if expected_hash else self.remote) def freeze_project(self, project: ProjectEntry) -> str | None: """Pin *project* to a cryptographic hash of the archive. 
@@ -264,7 +253,7 @@ def freeze_project(self, project: ProjectEntry) -> str | None: revision = on_disk.revision - # Already hash-pinned – revision is "sha256:<hex>" + # Already hash-pinned - revision is "sha256:<hex>" if revision.startswith(tuple(f"{a}:" for a in SUPPORTED_HASH_ALGORITHMS)): if project.hash == revision: return None diff --git a/dfetch/project/gitsubproject.py b/dfetch/project/gitsubproject.py index c52f3208..db98ba3d 100644 --- a/dfetch/project/gitsubproject.py +++ b/dfetch/project/gitsubproject.py @@ -8,7 +8,7 @@ from dfetch.manifest.project import ProjectEntry from dfetch.manifest.version import Version from dfetch.project.subproject import SubProject -from dfetch.util.util import safe_rmtree +from dfetch.util.util import LICENSE_GLOBS, safe_rmtree from dfetch.vcs.git import GitLocalRepo, GitRemote, get_git_version logger = get_logger(__name__) @@ -64,8 +64,8 @@ def _fetch_impl(self, version: Version) -> Version: # When exporting a file, the destination directory must already exist pathlib.Path(self.local_path).mkdir(parents=True, exist_ok=True) - license_globs = [f"/{name.lower()}" for name in self.LICENSE_GLOBS] + [ - f"/{name.upper()}" for name in self.LICENSE_GLOBS + license_globs = [f"/{name.lower()}" for name in LICENSE_GLOBS] + [ + f"/{name.upper()}" for name in LICENSE_GLOBS ] local_repo = GitLocalRepo(self.local_path) diff --git a/dfetch/project/subproject.py b/dfetch/project/subproject.py index 09a1299f..7eb1622f 100644 --- a/dfetch/project/subproject.py +++ b/dfetch/project/subproject.py @@ -1,6 +1,5 @@ """SubProject.""" -import fnmatch import os import pathlib from abc import ABC, abstractmethod @@ -26,7 +25,6 @@ class SubProject(ABC): """ NAME = "" - LICENSE_GLOBS = ["licen[cs]e*", "copying*", "copyright*"] def __init__(self, project: ProjectEntry) -> None: """Create the subproject.""" @@ -413,11 +411,3 @@ def freeze_project(self, project: ProjectEntry) -> str | None: on_disk_version.revision or on_disk_version.tag or str(on_disk_version) ) return None - - @staticmethod - def is_license_file(filename: str) -> bool: - """Check if the given filename is a license file.""" - return any( - fnmatch.fnmatch(filename.lower(), pattern) - for pattern in SubProject.LICENSE_GLOBS - ) diff --git a/dfetch/project/svnsubproject.py b/dfetch/project/svnsubproject.py index 6284daaf..85c28c22 100644 --- a/dfetch/project/svnsubproject.py +++ b/dfetch/project/svnsubproject.py @@ -11,6 +11,7 @@ from dfetch.util.util import ( find_matching_files, find_non_matching_files, + is_license_file, safe_rm, ) from dfetch.vcs.svn import SvnRemote, SvnRepo, get_svn_version @@ -103,7 +104,7 @@ def _determine_what_to_fetch(self, version: Version) -> tuple[str, str, str]: def _remove_ignored_files(self) -> None: """Remove any ignored files, whilst keeping license files.""" for file_or_dir in find_matching_files(self.local_path, self.ignore): - if not (file_or_dir.is_file() and self.is_license_file(file_or_dir.name)): + if not (file_or_dir.is_file() and is_license_file(file_or_dir.name)): safe_rm(file_or_dir) def _fetch_impl(self, version: Version) -> Version: @@ -168,9 +169,7 @@ def _get_info(self, branch: str) -> dict[str, str]: def _license_files(url_path: str) -> list[str]: return [ str(license) - for license in filter( - SvnSubProject.is_license_file, SvnRepo.files_in_path(url_path) - ) + for license in filter(is_license_file, SvnRepo.files_in_path(url_path)) ] def _get_revision(self, branch: str) -> str: diff --git a/dfetch/util/util.py b/dfetch/util/util.py index b5f83b20..0f265dd3 100644 ---
a/dfetch/util/util.py +++ b/dfetch/util/util.py @@ -12,6 +12,72 @@ from _hashlib import HASH +#: Glob patterns used to identify license files by filename. +LICENSE_GLOBS = ["licen[cs]e*", "copying*", "copyright*"] + + +def is_license_file(filename: str) -> bool: + """Return *True* when *filename* matches a known license file pattern.""" + return any(fnmatch.fnmatch(filename.lower(), pattern) for pattern in LICENSE_GLOBS) + + +def _copy_entry(src_entry: str, dest_entry: str) -> None: + """Copy a single file or directory *src_entry* to *dest_entry*.""" + if os.path.isdir(src_entry): + shutil.copytree(src_entry, dest_entry) + else: + shutil.copy2(src_entry, dest_entry) + + +def copy_directory_contents(src_dir: str, dest_dir: str) -> None: + """Copy every entry in *src_dir* directly into *dest_dir*. + + Directories are copied recursively; files are copied with metadata. + """ + for entry_name in os.listdir(src_dir): + _copy_entry( + os.path.join(src_dir, entry_name), + os.path.join(dest_dir, entry_name), + ) + + +def copy_src_subset( + src_root: str, dest_dir: str, src: str, keep_licenses: bool +) -> None: + """Copy a *src* sub-path from *src_root* into *dest_dir*. + + When *src* is a directory, its contents are copied flat into *dest_dir*. + When *src* is a single file, that file is copied into *dest_dir*. + If *keep_licenses* is ``True``, any license files found directly in + *src_root* are also copied regardless of the *src* filter. + + Raises: + RuntimeError: When *src* does not exist inside *src_root*. + """ + src_path = os.path.join(src_root, src) + if os.path.isdir(src_path): + copy_directory_contents(src_path, dest_dir) + elif os.path.isfile(src_path): + shutil.copy2(src_path, os.path.join(dest_dir, os.path.basename(src_path))) + else: + raise RuntimeError(f"src {src!r} was not found in the extracted archive") + + if keep_licenses: + for entry_name in os.listdir(src_root): + full_path = os.path.join(src_root, entry_name) + if os.path.isfile(full_path) and is_license_file(entry_name): + shutil.copy2(full_path, os.path.join(dest_dir, entry_name)) + + +def prune_files_by_pattern(directory: str, patterns: Sequence[str]) -> None: + """Remove files and directories in *directory* matching *patterns*. + + License files are never removed even when they match a pattern. + """ + for file_or_dir in find_matching_files(directory, patterns): + if not (file_or_dir.is_file() and is_license_file(file_or_dir.name)): + safe_rm(file_or_dir) + def _remove_readonly(func: Any, path: str, _: Any) -> None: if not os.access(path, os.W_OK): diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index cc011d4b..4f9a631a 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -23,7 +23,6 @@ from __future__ import annotations import hashlib -import hmac import http.client import os import pathlib @@ -36,8 +35,11 @@ from collections.abc import Sequence from dfetch.log import get_logger -from dfetch.project.subproject import SubProject -from dfetch.util.util import find_matching_files, safe_rm +from dfetch.util.util import ( + copy_directory_contents, + copy_src_subset, + prune_files_by_pattern, +) #: Archive file extensions recognised by DFetch. ARCHIVE_EXTENSIONS = (".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".zip") @@ -95,15 +97,6 @@ def compute_hash(path: str, algorithm: str = "sha256") -> str: return h.hexdigest() -def _safe_compare_hex(actual: str, expected: str) -> bool: - """Constant-time comparison of two hex digest strings. 
- - Uses :func:`hmac.compare_digest` to avoid leaking information about the - expected hash value via timing side-channels. - """ - return hmac.compare_digest(actual.lower(), expected.lower()) - - class ArchiveRemote: """Represents a remote archive (tar/zip) URL. @@ -197,7 +190,7 @@ class ArchiveLocalRepo: Supports ``.tar.gz``, ``.tgz``, ``.tar.bz2``, ``.tar.xz`` and ``.zip`` archives. A single top-level directory in the archive is automatically stripped (like ``tar --strip-components=1``), so the archive may be - structured as ``project-1.0/src/…`` or ``src/…`` – both work. + structured as ``project-1.0/src/…`` or ``src/…`` - both work. """ @staticmethod @@ -224,23 +217,23 @@ def extract( ArchiveLocalRepo._extract_raw(archive_path, tmp_dir) # Strip a single top-level directory if the archive uses one - entries = os.listdir(tmp_dir) - if len(entries) == 1 and os.path.isdir(os.path.join(tmp_dir, entries[0])): - extract_root = os.path.join(tmp_dir, entries[0]) + top_entries = os.listdir(tmp_dir) + if len(top_entries) == 1 and os.path.isdir( + os.path.join(tmp_dir, top_entries[0]) + ): + extract_root = os.path.join(tmp_dir, top_entries[0]) else: extract_root = tmp_dir pathlib.Path(dest_dir).mkdir(parents=True, exist_ok=True) if src: - ArchiveLocalRepo._copy_with_src( - extract_root, dest_dir, src.rstrip("/"), is_license - ) + copy_src_subset(extract_root, dest_dir, src.rstrip("/"), is_license) else: - ArchiveLocalRepo._copy_all(extract_root, dest_dir) + copy_directory_contents(extract_root, dest_dir) if ignore: - ArchiveLocalRepo._apply_ignore(dest_dir, ignore) + prune_files_by_pattern(dest_dir, ignore) @staticmethod def _check_archive_limits(member_count: int, total_bytes: int) -> None: @@ -345,58 +338,3 @@ def _extract_raw(archive_path: str, dest_dir: str) -> None: f"Unsupported archive format: '{archive_path}'. " f"Supported formats: {', '.join(ARCHIVE_EXTENSIONS)}" ) - - @staticmethod - def _copy_with_src( - extract_root: str, dest_dir: str, src: str, keep_licenses: bool - ) -> None: - """Copy only *src* sub-directory contents (and optionally licenses) to *dest_dir*.""" - src_path = os.path.join(extract_root, src) - - if os.path.isdir(src_path): - for item in os.listdir(src_path): - s = os.path.join(src_path, item) - d = os.path.join(dest_dir, item) - if os.path.isdir(s): - shutil.copytree(s, d) - else: - shutil.copy2(s, d) - elif os.path.isfile(src_path): - shutil.copy2(src_path, os.path.join(dest_dir, os.path.basename(src_path))) - else: - raise RuntimeError(f"src {src!r} was not found in archive") - - if keep_licenses: - for item in os.listdir(extract_root): - full = os.path.join(extract_root, item) - if os.path.isfile(full) and SubProject.is_license_file(item): - shutil.copy2(full, os.path.join(dest_dir, item)) - - @staticmethod - def _copy_all(extract_root: str, dest_dir: str) -> None: - """Copy all contents of *extract_root* into *dest_dir*.""" - for item in os.listdir(extract_root): - s = os.path.join(extract_root, item) - d = os.path.join(dest_dir, item) - if os.path.isdir(s): - shutil.copytree(s, d) - else: - shutil.copy2(s, d) - - @staticmethod - def _apply_ignore(dest_dir: str, ignore: Sequence[str]) -> None: - """Remove files/directories matching *ignore* patterns from *dest_dir*.""" - for file_or_dir in find_matching_files(dest_dir, ignore): - if not ( - file_or_dir.is_file() and SubProject.is_license_file(file_or_dir.name) - ): - safe_rm(file_or_dir) - - -def _suffix_for_url(url: str) -> str: - """Return the archive file suffix for a URL (e.g. 
'.tar.gz', '.zip').""" - lower = url.lower() - for ext in sorted(ARCHIVE_EXTENSIONS, key=len, reverse=True): - if lower.endswith(ext): - return ext - return ".archive" diff --git a/features/steps/generic_steps.py b/features/steps/generic_steps.py index ed06cb10..9c56a241 100644 --- a/features/steps/generic_steps.py +++ b/features/steps/generic_steps.py @@ -381,7 +381,7 @@ def step_impl(context, name): @then("the '{name}' json file includes") def step_impl(context, name): - """Partial JSON match – the expected JSON must be a *subset* of the actual file.""" + """Partial JSON match - the expected JSON must be a *subset* of the actual file.""" check_json_subset(name, context.text, context) diff --git a/tests/test_archive.py b/tests/test_archive.py index 0b161b91..421b0367 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -1,4 +1,4 @@ -"""Unit tests for dfetch.vcs.archive.""" +"""Unit tests for dfetch.vcs.archive and dfetch.project.archivesubproject.""" import io import os @@ -8,14 +8,12 @@ import pytest -import dfetch.project # noqa: F401 – must be imported before dfetch.vcs.archive to break circular init +from dfetch.project.archivesubproject import _safe_compare_hex, _suffix_for_url from dfetch.vcs.archive import ( ARCHIVE_EXTENSIONS, SUPPORTED_HASH_ALGORITHMS, ArchiveLocalRepo, ArchiveRemote, - _safe_compare_hex, - _suffix_for_url, compute_hash, is_archive_url, ) @@ -260,7 +258,7 @@ def test_is_accessible_missing_file(): # --------------------------------------------------------------------------- -# ArchiveLocalRepo.extract – basic smoke test +# ArchiveLocalRepo.extract - basic smoke test # --------------------------------------------------------------------------- diff --git a/tests/test_patch.py b/tests/test_patch.py index 4c9d88d0..2901aa07 100644 --- a/tests/test_patch.py +++ b/tests/test_patch.py @@ -247,7 +247,7 @@ def test_reverse_patch_zero_length_hunk(): assert _reverse_patch(patch) == expected -# Random small file: 5–15 lines, each line 5–20 chars (filtered to exclude control chars) +# Random small file: 5-15 lines, each line 5-20 chars (filtered to exclude control chars) st_file_lines = st.lists( st.text( min_size=5, From 17cbeff2729d28deda9a2693ef526368821bac2c Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 21 Mar 2026 07:42:21 +0000 Subject: [PATCH 13/35] Fall back to manifest tag/revision/hash when metadata fields are empty (#23) When a metadata file exists but both tag and revision are unset (e.g. the project was only tracked by branch), _determine_version now consults the manifest's tag, revision, and hash before returning an empty string. 
https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm --- dfetch/commands/report.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dfetch/commands/report.py b/dfetch/commands/report.py index 261f89c3..aa3fcd67 100644 --- a/dfetch/commands/report.py +++ b/dfetch/commands/report.py @@ -116,7 +116,14 @@ def _determine_version(project: ProjectEntry) -> str: """ try: metadata = Metadata.from_file(Metadata.from_project_entry(project).path) - version = metadata.tag or metadata.revision or "" + version = ( + metadata.tag + or metadata.revision + or project.tag + or project.revision + or project.hash + or "" + ) except FileNotFoundError: version = project.tag or project.revision or project.hash or "" return version From 1ae6367ca2a531b7029659022ac2ba7847f6b80c Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 21 Mar 2026 07:45:20 +0000 Subject: [PATCH 14/35] Set component.group for GitHub and Bitbucket SBOM components (#23) The github and bitbucket branches in _apply_external_references were inconsistently omitting component.group, while _apply_archive_refs and _apply_vcs_refs both set it. The PURL namespace (org/workspace) is now stored in component.group for all component types. https://claude.ai/code/session_01Mje1g91xprnER7WcUxWFXm --- dfetch/reporting/sbom_reporter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dfetch/reporting/sbom_reporter.py b/dfetch/reporting/sbom_reporter.py index 931d4346..5a456d63 100644 --- a/dfetch/reporting/sbom_reporter.py +++ b/dfetch/reporting/sbom_reporter.py @@ -258,6 +258,7 @@ def _apply_external_references( ) -> None: """Attach external references to *component* based on its PURL type.""" if purl.type == "github": + component.group = purl.namespace component.external_references.add( ExternalReference( type=ExternalReferenceType.VCS, @@ -265,6 +266,7 @@ def _apply_external_references( ) ) elif purl.type == "bitbucket": + component.group = purl.namespace component.external_references.add( ExternalReference( type=ExternalReferenceType.VCS, From a748173eabe407c62f683a5fb57f312c10caf673 Mon Sep 17 00:00:00 2001 From: Ben Date: Fri, 20 Mar 2026 23:08:56 +0000 Subject: [PATCH 15/35] Cleanup --- dfetch/project/archivesubproject.py | 61 ++++++++++++----------------- dfetch/vcs/archive.py | 45 ++++++++++++++++++--- 2 files changed, 65 insertions(+), 41 deletions(-) diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index 2f928c91..987b7589 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -56,7 +56,6 @@ SUPPORTED_HASH_ALGORITHMS, ArchiveLocalRepo, ArchiveRemote, - compute_hash, is_archive_url, ) @@ -115,13 +114,15 @@ def get_default_branch(self) -> str: """Archives have no branches; return an empty string.""" return "" - def _latest_revision_on_branch(self, branch: str) -> str: # noqa: ARG002 + def _latest_revision_on_branch(self, branch: str) -> str: """For archives the 'latest revision' is always the URL (or hash).""" + del branch return self.remote def _download_and_compute_hash(self, algorithm: str = "sha256") -> str: """Download the archive to a temporary file and return its hash. + The hash is computed during the download stream — no extra file read. The temporary file is always cleaned up, even on error. 
Raises: @@ -130,34 +131,21 @@ def _download_and_compute_hash(self, algorithm: str = "sha256") -> str: fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(self.remote)) os.close(fd) try: - self._remote_repo.download(tmp_path) - return compute_hash(tmp_path, algorithm) + return self._remote_repo.download(tmp_path, algorithm=algorithm) finally: try: os.remove(tmp_path) except OSError: pass - def _does_revision_exist(self, revision: str) -> bool: - """Check whether *revision* (a hash or URL string) is still valid. + def _does_revision_exist(self, revision: str) -> bool: # noqa: ARG002 + """Check whether the archive URL is still reachable. - * If *revision* starts with a known hash algorithm prefix (e.g. - ``sha256:``) **the entire archive is downloaded** to a temporary file - and its hash is verified against *revision*. This is intentionally - thorough — a lightweight HEAD check cannot confirm content integrity. - * Otherwise *revision* is treated as the URL itself and a lightweight - reachability check is performed via :meth:`ArchiveRemote.is_accessible`. + A lightweight HEAD (or partial-GET) reachability check is used for + all revision types, including hash-pinned ones. Full content-integrity + verification is intentionally deferred to fetch time (``_fetch_impl``), + keeping ``dfetch check`` fast even for large archives over slow links. """ - for algo in SUPPORTED_HASH_ALGORITHMS: - if revision.startswith(f"{algo}:"): - expected_hex = revision.split(":", 1)[1] - try: - actual = self._download_and_compute_hash(algo) - return _safe_compare_hex(actual, expected_hex) - except RuntimeError: - return False - - # revision is the URL - just check accessibility return self._remote_repo.is_accessible() def _list_of_tags(self) -> list[str]: @@ -191,28 +179,31 @@ def _fetch_impl(self, version: Version) -> Version: Returns: The version that was actually fetched (hash string or URL). """ - expected_hash = self._project_entry.hash + revision = version.revision pathlib.Path(self.local_path).mkdir(parents=True, exist_ok=True) fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(self.remote)) os.close(fd) try: - self._remote_repo.download(tmp_path) - - if expected_hash: - if ":" not in expected_hash: - raise RuntimeError( - f"Malformed integrity.hash for {self._project_entry.name!r}: " - f"expected '<algorithm>:<hex>', got {expected_hash!r}" - ) - algorithm, expected_hex = expected_hash.split(":", 1) - actual_hex = compute_hash(tmp_path, algorithm) + hash_algo = next( + ( + algo + for algo in SUPPORTED_HASH_ALGORITHMS + if revision.startswith(f"{algo}:") + ), + None, + ) + if hash_algo: + expected_hex = revision.split(":", 1)[1] + actual_hex = self._remote_repo.download(tmp_path, algorithm=hash_algo) if not _safe_compare_hex(actual_hex, expected_hex): raise RuntimeError( f"Hash mismatch for {self._project_entry.name}! " - f"{algorithm} expected {expected_hex}" + f"{hash_algo} expected {expected_hex}" ) + else: + self._remote_repo.download(tmp_path) ArchiveLocalRepo.extract( tmp_path, @@ -226,7 +217,7 @@ def _fetch_impl(self, version: Version) -> Version: except OSError: pass - return Version(revision=expected_hash if expected_hash else self.remote) + return version def freeze_project(self, project: ProjectEntry) -> str | None: """Pin *project* to a cryptographic hash of the archive.
diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index 4f9a631a..3cc41db1 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -33,6 +33,7 @@ import urllib.parse import zipfile from collections.abc import Sequence +from typing import overload from dfetch.log import get_logger from dfetch.util.util import ( @@ -141,32 +142,62 @@ def _is_http_reachable(self, parsed: urllib.parse.ParseResult) -> bool: return False return False - def download(self, dest_path: str) -> None: - """Download the archive to *dest_path*. + @overload + def download(self, dest_path: str, algorithm: str) -> str: ... + @overload + def download(self, dest_path: str, algorithm: None = ...) -> None: ... + + def download(self, dest_path: str, algorithm: str | None = None) -> str | None: + """Download the archive to *dest_path*, optionally computing its hash. + + When *algorithm* is given the hash is computed during the download + stream (zero extra file reads) and the hex digest is returned. Args: dest_path: Local file path to write the archive to. + algorithm: Hash algorithm name (e.g. ``"sha256"``). When *None* + no hash is computed and *None* is returned. + + Returns: + Hex digest string when *algorithm* is provided, else *None*. Raises: RuntimeError: On download failure or unsupported URL scheme. """ + hasher = hashlib.new(algorithm) if algorithm else None parsed = urllib.parse.urlparse(self.url) if parsed.scheme == "file": try: - shutil.copy(parsed.path, dest_path) + if hasher: + with open(parsed.path, "rb") as src, open(dest_path, "wb") as dst: + for chunk in iter(lambda: src.read(65536), b""): + dst.write(chunk) + hasher.update(chunk) + else: + shutil.copy(parsed.path, dest_path) except OSError as exc: raise RuntimeError( f"'{self.url}' is not a valid URL or unreachable: {exc}" ) from exc elif parsed.scheme in ("http", "https"): - self._http_download(parsed, dest_path) + self._http_download(parsed, dest_path, hasher=hasher) else: raise RuntimeError( f"'{self.url}' uses unsupported scheme '{parsed.scheme}'." ) + return hasher.hexdigest() if hasher else None + + def _http_download( + self, + parsed: urllib.parse.ParseResult, + dest_path: str, + hasher: hashlib._Hash | None = None, + ) -> None: + """Download an HTTP/HTTPS resource to *dest_path*. - def _http_download(self, parsed: urllib.parse.ParseResult, dest_path: str) -> None: - """Download an HTTP/HTTPS resource to *dest_path*.""" + When *hasher* is provided each chunk is fed into it during streaming, + so the caller gets the hash without an extra file read. 
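+
+        A caller-level usage sketch (the URL here is a placeholder, not a
+        real endpoint)::
+
+            remote = ArchiveRemote("https://example.com/lib.tar.gz")
+            digest = remote.download("/tmp/lib.tar.gz", algorithm="sha256")
+            # 'digest' is the lowercase sha256 hex of the downloaded bytes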
+ """ conn = _http_conn(parsed.scheme, parsed.netloc, timeout=60) try: conn.request("GET", _resource_path(parsed)) @@ -176,6 +207,8 @@ def _http_download(self, parsed: urllib.parse.ParseResult, dest_path: str) -> No with open(dest_path, "wb") as fh: while chunk := resp.read(65536): fh.write(chunk) + if hasher: + hasher.update(chunk) except (OSError, http.client.HTTPException) as exc: raise RuntimeError( f"'{self.url}' is not a valid URL or unreachable: {exc}" From 328dea938c465046a6a7e83e839cd81637ffae63 Mon Sep 17 00:00:00 2001 From: Ben Date: Fri, 20 Mar 2026 23:18:20 +0000 Subject: [PATCH 16/35] IntegretyHash class --- dfetch/project/archivesubproject.py | 55 ++++++++----------------- dfetch/reporting/sbom_reporter.py | 9 +++-- dfetch/vcs/archive.py | 63 +++++++++++++++++++++++++++++ tests/test_archive.py | 21 +++++----- 4 files changed, 97 insertions(+), 51 deletions(-) diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index 987b7589..cd0371a7 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -41,7 +41,6 @@ from __future__ import annotations -import hmac import http.client import os import pathlib @@ -53,24 +52,15 @@ from dfetch.project.subproject import SubProject from dfetch.vcs.archive import ( ARCHIVE_EXTENSIONS, - SUPPORTED_HASH_ALGORITHMS, ArchiveLocalRepo, ArchiveRemote, + IntegrityHash, is_archive_url, ) logger = get_logger(__name__) -def _safe_compare_hex(actual: str, expected: str) -> bool: - """Constant-time comparison of two hex digest strings. - - Uses :func:`hmac.compare_digest` to avoid leaking timing information about - the expected hash value. - """ - return hmac.compare_digest(actual.lower(), expected.lower()) - - def _suffix_for_url(url: str) -> str: """Return the archive file suffix for *url* (e.g. ``'.tar.gz'``, ``'.zip'``).""" lower = url.lower() @@ -119,8 +109,8 @@ def _latest_revision_on_branch(self, branch: str) -> str: del branch return self.remote - def _download_and_compute_hash(self, algorithm: str = "sha256") -> str: - """Download the archive to a temporary file and return its hash. + def _download_and_compute_hash(self, algorithm: str = "sha256") -> IntegrityHash: + """Download the archive to a temporary file and return its :class:`IntegrityHash`. The hash is computed during the download stream — no extra file read. The temporary file is always cleaned up, even on error. @@ -131,7 +121,8 @@ def _download_and_compute_hash(self, algorithm: str = "sha256") -> str: fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(self.remote)) os.close(fd) try: - return self._remote_repo.download(tmp_path, algorithm=algorithm) + hex_digest = self._remote_repo.download(tmp_path, algorithm=algorithm) + return IntegrityHash(algorithm, hex_digest) finally: try: os.remove(tmp_path) @@ -186,21 +177,15 @@ def _fetch_impl(self, version: Version) -> Version: fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(self.remote)) os.close(fd) try: - hash_algo = next( - ( - algo - for algo in SUPPORTED_HASH_ALGORITHMS - if revision.startswith(f"{algo}:") - ), - None, - ) - if hash_algo: - expected_hex = revision.split(":", 1)[1] - actual_hex = self._remote_repo.download(tmp_path, algorithm=hash_algo) - if not _safe_compare_hex(actual_hex, expected_hex): + expected = IntegrityHash.parse(revision) + if expected: + actual_hex = self._remote_repo.download( + tmp_path, algorithm=expected.algorithm + ) + if not expected.matches(actual_hex): raise RuntimeError( f"Hash mismatch for {self._project_entry.name}! 
" - f"{hash_algo} expected {expected_hex}" + f"{expected.algorithm} expected {expected.hex_digest}" ) else: self._remote_repo.download(tmp_path) @@ -244,17 +229,11 @@ def freeze_project(self, project: ProjectEntry) -> str | None: revision = on_disk.revision - # Already hash-pinned - revision is "sha256:" - if revision.startswith(tuple(f"{a}:" for a in SUPPORTED_HASH_ALGORITHMS)): - if project.hash == revision: - return None - project.hash = revision - return revision - - # URL-pinned: download the archive now and compute its hash. - # Raises RuntimeError on failure so the caller (freeze.py) can log it. - hex_value = self._download_and_compute_hash("sha256") - new_hash = f"sha256:{hex_value}" + # Already hash-pinned — use the on-disk revision directly. + pinned = IntegrityHash.parse(revision) or self._download_and_compute_hash( + "sha256" + ) + new_hash = str(pinned) if project.hash == new_hash: return None project.hash = new_hash diff --git a/dfetch/reporting/sbom_reporter.py b/dfetch/reporting/sbom_reporter.py index 5a456d63..43ec2ba6 100644 --- a/dfetch/reporting/sbom_reporter.py +++ b/dfetch/reporting/sbom_reporter.py @@ -115,6 +115,7 @@ from dfetch.reporting.reporter import Reporter from dfetch.util.license import License from dfetch.util.purl import DFETCH_TO_CDX_HASH_ALGORITHM +from dfetch.vcs.archive import IntegrityHash # PyRight is pedantic with decorators see https://github.com/madpah/serializable/issues/8 # It might be fixable with https://github.com/microsoft/pyright/discussions/4426, would prefer @@ -291,14 +292,14 @@ def _apply_archive_refs( url=XsUri(download_url), ) ) - if version and ":" in version: - algo_prefix, hex_value = version.split(":", 1) - cdx_algo_name = DFETCH_TO_CDX_HASH_ALGORITHM.get(algo_prefix) + integrity = IntegrityHash.parse(version) if version else None + if integrity: + cdx_algo_name = DFETCH_TO_CDX_HASH_ALGORITHM.get(integrity.algorithm) if cdx_algo_name: component.hashes.add( HashType( alg=HashAlgorithm(cdx_algo_name), - content=hex_value, + content=integrity.hex_digest, ) ) diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index 3cc41db1..b991bec2 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -23,6 +23,7 @@ from __future__ import annotations import hashlib +import hmac import http.client import os import pathlib @@ -50,6 +51,68 @@ logger = get_logger(__name__) + +class IntegrityHash: + """A parsed ``:`` integrity hash value. + + Use :meth:`parse` to build one from a raw string (returns *None* when the + string does not match a known algorithm prefix). Use the constructor when + both parts are already known. + + >>> h = IntegrityHash.parse("sha256:abc123") + >>> h.algorithm, h.hex_digest + ('sha256', 'abc123') + >>> str(h) + 'sha256:abc123' + """ + + def __init__(self, algorithm: str, hex_digest: str) -> None: + """Create an IntegrityHash from known *algorithm* and *hex_digest*.""" + self.algorithm = algorithm + self.hex_digest = hex_digest + + @classmethod + def parse(cls, value: str) -> IntegrityHash | None: + """Return an :class:`IntegrityHash` when *value* is ``:``. + + Returns *None* when *value* does not start with a known algorithm + prefix (i.e. it is treated as a plain URL revision). 
+ """ + for algo in SUPPORTED_HASH_ALGORITHMS: + if value.startswith(f"{algo}:"): + return cls(algo, value[len(algo) + 1 :]) + return None + + def __str__(self) -> str: + """Return the canonical ``:`` string.""" + return f"{self.algorithm}:{self.hex_digest}" + + def __repr__(self) -> str: + """Return a developer-readable representation.""" + return f"IntegrityHash({self.algorithm!r}, {self.hex_digest!r})" + + def __eq__(self, other: object) -> bool: + """Compare two :class:`IntegrityHash` instances (case-insensitive hex).""" + if isinstance(other, IntegrityHash): + return ( + self.algorithm == other.algorithm + and self.hex_digest.lower() == other.hex_digest.lower() + ) + return NotImplemented + + def __hash__(self) -> int: + """Hash based on algorithm and lower-cased hex digest.""" + return hash((self.algorithm, self.hex_digest.lower())) + + def matches(self, actual_hex: str) -> bool: + """Return *True* when *actual_hex* equals this hash's digest. + + Uses :func:`hmac.compare_digest` for constant-time comparison to + avoid leaking timing information about the expected value. + """ + return hmac.compare_digest(actual_hex.lower(), self.hex_digest.lower()) + + # Safety limits applied during extraction to prevent decompression bombs. _MAX_UNCOMPRESSED_BYTES = 500 * 1024 * 1024 # 500 MB _MAX_MEMBER_COUNT = 10_000 diff --git a/tests/test_archive.py b/tests/test_archive.py index 421b0367..59cbb7e1 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -8,12 +8,13 @@ import pytest -from dfetch.project.archivesubproject import _safe_compare_hex, _suffix_for_url +from dfetch.project.archivesubproject import _suffix_for_url from dfetch.vcs.archive import ( ARCHIVE_EXTENSIONS, SUPPORTED_HASH_ALGORITHMS, ArchiveLocalRepo, ArchiveRemote, + IntegrityHash, compute_hash, is_archive_url, ) @@ -77,21 +78,23 @@ def test_compute_hash_default_is_sha256(): # --------------------------------------------------------------------------- -# _safe_compare_hex +# IntegrityHash.matches # --------------------------------------------------------------------------- -def test_safe_compare_hex_equal(): - h = "a" * 64 - assert _safe_compare_hex(h, h) is True +def test_integrity_hash_matches_equal(): + h = IntegrityHash("sha256", "a" * 64) + assert h.matches("a" * 64) is True -def test_safe_compare_hex_case_insensitive(): - assert _safe_compare_hex("ABCDEF", "abcdef") is True +def test_integrity_hash_matches_case_insensitive(): + h = IntegrityHash("sha256", "abcdef") + assert h.matches("ABCDEF") is True -def test_safe_compare_hex_not_equal(): - assert _safe_compare_hex("a" * 64, "b" * 64) is False +def test_integrity_hash_matches_not_equal(): + h = IntegrityHash("sha256", "a" * 64) + assert h.matches("b" * 64) is False # --------------------------------------------------------------------------- From c3fe25c59c70781822dcd1b8e9d73f8a2b302e1e Mon Sep 17 00:00:00 2001 From: Ben Date: Fri, 20 Mar 2026 23:26:30 +0000 Subject: [PATCH 17/35] Create integrety_hash module --- dfetch/project/archivesubproject.py | 2 +- dfetch/reporting/sbom_reporter.py | 2 +- dfetch/util/purl.py | 2 + dfetch/vcs/archive.py | 91 ----------------------- dfetch/vcs/integrity_hash.py | 69 +++++++++++++++++ tests/test_archive.py | 79 -------------------- tests/test_integrity_hash.py | 111 ++++++++++++++++++++++++++++ 7 files changed, 184 insertions(+), 172 deletions(-) create mode 100644 dfetch/vcs/integrity_hash.py create mode 100644 tests/test_integrity_hash.py diff --git a/dfetch/project/archivesubproject.py 
b/dfetch/project/archivesubproject.py index cd0371a7..59dbaafd 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -54,9 +54,9 @@ ARCHIVE_EXTENSIONS, ArchiveLocalRepo, ArchiveRemote, - IntegrityHash, is_archive_url, ) +from dfetch.vcs.integrity_hash import IntegrityHash logger = get_logger(__name__) diff --git a/dfetch/reporting/sbom_reporter.py b/dfetch/reporting/sbom_reporter.py index 43ec2ba6..9df6dc53 100644 --- a/dfetch/reporting/sbom_reporter.py +++ b/dfetch/reporting/sbom_reporter.py @@ -115,7 +115,7 @@ from dfetch.reporting.reporter import Reporter from dfetch.util.license import License from dfetch.util.purl import DFETCH_TO_CDX_HASH_ALGORITHM -from dfetch.vcs.archive import IntegrityHash +from dfetch.vcs.integrity_hash import IntegrityHash # PyRight is pedantic with decorators see https://github.com/madpah/serializable/issues/8 # It might be fixable with https://github.com/microsoft/pyright/discussions/4426, would prefer diff --git a/dfetch/util/purl.py b/dfetch/util/purl.py index f2b528c7..d68ab5e1 100644 --- a/dfetch/util/purl.py +++ b/dfetch/util/purl.py @@ -41,6 +41,8 @@ # Map from dfetch hash-field algorithm prefix to CycloneDX HashAlgorithm name DFETCH_TO_CDX_HASH_ALGORITHM: dict[str, str] = { "sha256": "SHA-256", + "sha384": "SHA-384", + "sha512": "SHA-512", } # Name given to a package or group if it is not extractable from the URL diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index b991bec2..a4d53996 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -23,7 +23,6 @@ from __future__ import annotations import hashlib -import hmac import http.client import os import pathlib @@ -46,73 +45,8 @@ #: Archive file extensions recognised by DFetch. ARCHIVE_EXTENSIONS = (".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".zip") -#: Hash algorithms supported by the ``integrity.hash`` manifest field. -SUPPORTED_HASH_ALGORITHMS = ("sha256",) - logger = get_logger(__name__) - -class IntegrityHash: - """A parsed ``<algorithm>:<hex_digest>`` integrity hash value. - - Use :meth:`parse` to build one from a raw string (returns *None* when the - string does not match a known algorithm prefix). Use the constructor when - both parts are already known. - - >>> h = IntegrityHash.parse("sha256:abc123") - >>> h.algorithm, h.hex_digest - ('sha256', 'abc123') - >>> str(h) - 'sha256:abc123' - """ - - def __init__(self, algorithm: str, hex_digest: str) -> None: - """Create an IntegrityHash from known *algorithm* and *hex_digest*.""" - self.algorithm = algorithm - self.hex_digest = hex_digest - - @classmethod - def parse(cls, value: str) -> IntegrityHash | None: - """Return an :class:`IntegrityHash` when *value* is ``<algorithm>:<hex_digest>``. - - Returns *None* when *value* does not start with a known algorithm - prefix (i.e. it is treated as a plain URL revision). 
- """ - for algo in SUPPORTED_HASH_ALGORITHMS: - if value.startswith(f"{algo}:"): - return cls(algo, value[len(algo) + 1 :]) - return None - - def __str__(self) -> str: - """Return the canonical ``:`` string.""" - return f"{self.algorithm}:{self.hex_digest}" - - def __repr__(self) -> str: - """Return a developer-readable representation.""" - return f"IntegrityHash({self.algorithm!r}, {self.hex_digest!r})" - - def __eq__(self, other: object) -> bool: - """Compare two :class:`IntegrityHash` instances (case-insensitive hex).""" - if isinstance(other, IntegrityHash): - return ( - self.algorithm == other.algorithm - and self.hex_digest.lower() == other.hex_digest.lower() - ) - return NotImplemented - - def __hash__(self) -> int: - """Hash based on algorithm and lower-cased hex digest.""" - return hash((self.algorithm, self.hex_digest.lower())) - - def matches(self, actual_hex: str) -> bool: - """Return *True* when *actual_hex* equals this hash's digest. - - Uses :func:`hmac.compare_digest` for constant-time comparison to - avoid leaking timing information about the expected value. - """ - return hmac.compare_digest(actual_hex.lower(), self.hex_digest.lower()) - - # Safety limits applied during extraction to prevent decompression bombs. _MAX_UNCOMPRESSED_BYTES = 500 * 1024 * 1024 # 500 MB _MAX_MEMBER_COUNT = 10_000 @@ -136,31 +70,6 @@ def is_archive_url(url: str) -> bool: return any(url.lower().endswith(ext) for ext in ARCHIVE_EXTENSIONS) -def compute_hash(path: str, algorithm: str = "sha256") -> str: - """Compute the hex digest of *path* using *algorithm*. - - Args: - path: Path to the file. - algorithm: Hash algorithm name (e.g. ``"sha256"``). - - Returns: - Lowercase hex digest string. - - Raises: - RuntimeError: When *algorithm* is not supported. - """ - if algorithm not in SUPPORTED_HASH_ALGORITHMS: - raise RuntimeError( - f"Unsupported hash algorithm '{algorithm}'. " - f"Supported: {', '.join(SUPPORTED_HASH_ALGORITHMS)}" - ) - h = hashlib.new(algorithm) - with open(path, "rb") as fh: - for chunk in iter(lambda: fh.read(65536), b""): - h.update(chunk) - return h.hexdigest() - - class ArchiveRemote: """Represents a remote archive (tar/zip) URL. diff --git a/dfetch/vcs/integrity_hash.py b/dfetch/vcs/integrity_hash.py new file mode 100644 index 00000000..9fcf4e45 --- /dev/null +++ b/dfetch/vcs/integrity_hash.py @@ -0,0 +1,69 @@ +"""Integrity hash: a ``:`` content fingerprint.""" + +from __future__ import annotations + +import hmac + +#: Supported hash algorithms, ordered strongest-first so :meth:`IntegrityHash.parse` +#: matches the most specific prefix when algorithm names share a common prefix. +SUPPORTED_HASH_ALGORITHMS = ("sha512", "sha384", "sha256") + + +class IntegrityHash: + """A parsed ``:`` integrity hash value. + + Use :meth:`parse` to build one from a raw string (returns *None* when the + string does not match a known algorithm prefix). Use the constructor when + both parts are already known. + + >>> h = IntegrityHash.parse("sha256:abc123") + >>> h.algorithm, h.hex_digest + ('sha256', 'abc123') + >>> str(h) + 'sha256:abc123' + """ + + def __init__(self, algorithm: str, hex_digest: str) -> None: + """Create an IntegrityHash from known *algorithm* and *hex_digest*.""" + self.algorithm = algorithm + self.hex_digest = hex_digest + + @classmethod + def parse(cls, value: str) -> IntegrityHash | None: + """Return an :class:`IntegrityHash` when *value* is ``:``. + + Returns *None* when *value* does not start with a known algorithm prefix. 
+ """ + for algo in SUPPORTED_HASH_ALGORITHMS: + if value.startswith(f"{algo}:"): + return cls(algo, value[len(algo) + 1 :]) + return None + + def __str__(self) -> str: + """Return the canonical ``:`` string.""" + return f"{self.algorithm}:{self.hex_digest}" + + def __repr__(self) -> str: + """Return a developer-readable representation.""" + return f"IntegrityHash({self.algorithm!r}, {self.hex_digest!r})" + + def __eq__(self, other: object) -> bool: + """Compare two :class:`IntegrityHash` instances (case-insensitive hex).""" + if isinstance(other, IntegrityHash): + return ( + self.algorithm == other.algorithm + and self.hex_digest.lower() == other.hex_digest.lower() + ) + return NotImplemented + + def __hash__(self) -> int: + """Hash based on algorithm and lower-cased hex digest.""" + return hash((self.algorithm, self.hex_digest.lower())) + + def matches(self, actual_hex: str) -> bool: + """Return *True* when *actual_hex* equals this hash's digest. + + Uses :func:`hmac.compare_digest` for constant-time comparison to + avoid leaking timing information about the expected value. + """ + return hmac.compare_digest(actual_hex.lower(), self.hex_digest.lower()) diff --git a/tests/test_archive.py b/tests/test_archive.py index 59cbb7e1..dedc422f 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -11,11 +11,8 @@ from dfetch.project.archivesubproject import _suffix_for_url from dfetch.vcs.archive import ( ARCHIVE_EXTENSIONS, - SUPPORTED_HASH_ALGORITHMS, ArchiveLocalRepo, ArchiveRemote, - IntegrityHash, - compute_hash, is_archive_url, ) @@ -25,78 +22,6 @@ _check_tar_members = ArchiveLocalRepo._check_tar_members -# --------------------------------------------------------------------------- -# compute_hash -# --------------------------------------------------------------------------- - - -def test_compute_hash_empty_file(): - with tempfile.NamedTemporaryFile(delete=False) as f: - path = f.name - try: - digest = compute_hash(path, "sha256") - # SHA-256 of empty string - assert ( - digest == "e3b0c44298fc1c149afbf4c8996fb924" - "27ae41e4649b934ca495991b7852b855" - ) - finally: - os.remove(path) - - -def test_compute_hash_known_content(): - with tempfile.NamedTemporaryFile(delete=False) as f: - f.write(b"hello world\n") - path = f.name - try: - digest = compute_hash(path, "sha256") - assert len(digest) == 64 - assert all(c in "0123456789abcdef" for c in digest) - finally: - os.remove(path) - - -def test_compute_hash_unsupported_algorithm(): - with tempfile.NamedTemporaryFile(delete=False) as f: - path = f.name - try: - with pytest.raises(RuntimeError, match="Unsupported hash algorithm"): - compute_hash(path, "md5") - finally: - os.remove(path) - - -def test_compute_hash_default_is_sha256(): - with tempfile.NamedTemporaryFile(delete=False) as f: - f.write(b"data") - path = f.name - try: - digest = compute_hash(path) - assert len(digest) == 64 - finally: - os.remove(path) - - -# --------------------------------------------------------------------------- -# IntegrityHash.matches -# --------------------------------------------------------------------------- - - -def test_integrity_hash_matches_equal(): - h = IntegrityHash("sha256", "a" * 64) - assert h.matches("a" * 64) is True - - -def test_integrity_hash_matches_case_insensitive(): - h = IntegrityHash("sha256", "abcdef") - assert h.matches("ABCDEF") is True - - -def test_integrity_hash_matches_not_equal(): - h = IntegrityHash("sha256", "a" * 64) - assert h.matches("b" * 64) is False - - # 
--------------------------------------------------------------------------- # is_archive_url # --------------------------------------------------------------------------- @@ -324,7 +249,3 @@ def test_all_archive_extensions_covered(): assert len(ARCHIVE_EXTENSIONS) > 0 for ext in ARCHIVE_EXTENSIONS: assert ext.startswith(".") - - -def test_supported_hash_algorithms(): - assert "sha256" in SUPPORTED_HASH_ALGORITHMS diff --git a/tests/test_integrity_hash.py b/tests/test_integrity_hash.py new file mode 100644 index 00000000..d0c06261 --- /dev/null +++ b/tests/test_integrity_hash.py @@ -0,0 +1,111 @@ +"""Unit tests for dfetch.vcs.integrity_hash.""" + +import pytest + +from dfetch.vcs.integrity_hash import SUPPORTED_HASH_ALGORITHMS, IntegrityHash + +# --------------------------------------------------------------------------- +# SUPPORTED_HASH_ALGORITHMS +# --------------------------------------------------------------------------- + + +def test_supported_hash_algorithms_contains_sha256(): + assert "sha256" in SUPPORTED_HASH_ALGORITHMS + + +def test_supported_hash_algorithms_contains_sha384(): + assert "sha384" in SUPPORTED_HASH_ALGORITHMS + + +def test_supported_hash_algorithms_contains_sha512(): + assert "sha512" in SUPPORTED_HASH_ALGORITHMS + + +# --------------------------------------------------------------------------- +# IntegrityHash.parse +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "value,expected_algo,expected_hex", + [ + ("sha256:abc123", "sha256", "abc123"), + ("sha384:def456", "sha384", "def456"), + ("sha512:ghi789", "sha512", "ghi789"), + ], +) +def test_parse_valid(value, expected_algo, expected_hex): + h = IntegrityHash.parse(value) + assert h is not None + assert h.algorithm == expected_algo + assert h.hex_digest == expected_hex + + +def test_parse_returns_none_for_url(): + assert IntegrityHash.parse("https://example.com/lib.tar.gz") is None + + +def test_parse_returns_none_for_plain_string(): + assert IntegrityHash.parse("notahash") is None + + +# --------------------------------------------------------------------------- +# IntegrityHash.__str__ / __repr__ +# --------------------------------------------------------------------------- + + +def test_str_roundtrip(): + h = IntegrityHash("sha256", "abc123") + assert str(h) == "sha256:abc123" + + +def test_repr(): + h = IntegrityHash("sha256", "abc123") + assert repr(h) == "IntegrityHash('sha256', 'abc123')" + + +# --------------------------------------------------------------------------- +# IntegrityHash.__eq__ / __hash__ +# --------------------------------------------------------------------------- + + +def test_eq_same(): + assert IntegrityHash("sha256", "abc") == IntegrityHash("sha256", "abc") + + +def test_eq_case_insensitive_hex(): + assert IntegrityHash("sha256", "ABCDEF") == IntegrityHash("sha256", "abcdef") + + +def test_eq_different_digest(): + assert IntegrityHash("sha256", "aaa") != IntegrityHash("sha256", "bbb") + + +def test_eq_non_integrity_hash_returns_not_implemented(): + assert IntegrityHash("sha256", "abc").__eq__("sha256:abc") is NotImplemented + + +def test_hash_usable_in_set(): + a = IntegrityHash("sha256", "abc") + b = IntegrityHash("sha256", "ABC") + assert len({a, b}) == 1 + + +# --------------------------------------------------------------------------- +# IntegrityHash.matches +# --------------------------------------------------------------------------- + + +def test_matches_equal(): + h = IntegrityHash("sha256", "a" * 64) + assert 
h.matches("a" * 64) is True + + +def test_matches_case_insensitive(): + h = IntegrityHash("sha256", "abcdef") + assert h.matches("ABCDEF") is True + + +def test_matches_not_equal(): + h = IntegrityHash("sha256", "a" * 64) + assert h.matches("b" * 64) is False From 13dec1ac2c036cd020aa6bc7051fa348298f82e6 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 21 Mar 2026 13:43:50 +0000 Subject: [PATCH 18/35] Review comments --- dfetch/project/subproject.py | 6 ++- dfetch/util/util.py | 15 +++++-- features/steps/generic_steps.py | 15 +++++-- tests/test_subproject.py | 70 +++++++++++++++++++++++++++++++++ tests/test_util.py | 56 ++++++++++++++++++++++++++ 5 files changed, 153 insertions(+), 9 deletions(-) create mode 100644 tests/test_util.py diff --git a/dfetch/project/subproject.py b/dfetch/project/subproject.py index 7eb1622f..9c705d10 100644 --- a/dfetch/project/subproject.py +++ b/dfetch/project/subproject.py @@ -403,7 +403,11 @@ def freeze_project(self, project: ProjectEntry) -> str | None: download error). Callers should catch and report these. """ on_disk_version = self.on_disk_version() - if project.version == on_disk_version: + if ( + on_disk_version + and project.version.tag == on_disk_version.tag + and project.version.revision == on_disk_version.revision + ): return None if on_disk_version: project.version = on_disk_version diff --git a/dfetch/util/util.py b/dfetch/util/util.py index 0f265dd3..09d5606c 100644 --- a/dfetch/util/util.py +++ b/dfetch/util/util.py @@ -54,11 +54,18 @@ def copy_src_subset( Raises: RuntimeError: When *src* does not exist inside *src_root*. """ + resolved_src_root = os.path.realpath(src_root) src_path = os.path.join(src_root, src) - if os.path.isdir(src_path): - copy_directory_contents(src_path, dest_dir) - elif os.path.isfile(src_path): - shutil.copy2(src_path, os.path.join(dest_dir, os.path.basename(src_path))) + resolved_src_path = os.path.realpath(src_path) + if os.path.commonpath([resolved_src_root, resolved_src_path]) != resolved_src_root: + raise RuntimeError(f"src {src!r} escapes the source root") + if os.path.isdir(resolved_src_path): + copy_directory_contents(resolved_src_path, dest_dir) + elif os.path.isfile(resolved_src_path): + shutil.copy2( + resolved_src_path, + os.path.join(dest_dir, os.path.basename(resolved_src_path)), + ) else: raise RuntimeError(f"src {src!r} was not found in the extracted archive") diff --git a/features/steps/generic_steps.py b/features/steps/generic_steps.py index 9c56a241..5c3a7f7b 100644 --- a/features/steps/generic_steps.py +++ b/features/steps/generic_steps.py @@ -116,10 +116,17 @@ def _json_subset_matches(expected, actual) -> bool: if isinstance(expected, list): if not isinstance(actual, list): return False - return all( - any(_json_subset_matches(exp_item, act_item) for act_item in actual) - for exp_item in expected - ) + matched = [False] * len(actual) + for exp_item in expected: + found = False + for i, act_item in enumerate(actual): + if not matched[i] and _json_subset_matches(exp_item, act_item): + matched[i] = True + found = True + break + if not found: + return False + return True return expected == actual diff --git a/tests/test_subproject.py b/tests/test_subproject.py index b3503c29..2fe9976d 100644 --- a/tests/test_subproject.py +++ b/tests/test_subproject.py @@ -135,6 +135,76 @@ def test_are_there_local_changes( ) +@pytest.mark.parametrize( + "name, project_version, on_disk_version, expect_return, expect_project_version", + [ + ( + "already-pinned-tag-matches", + Version(tag="v1.0", branch="main"), + 
Version(tag="v1.0", branch="main"), + None, + Version(tag="v1.0", branch="main"), + ), + ( + "already-pinned-tag-matches-branch-differs", + Version(tag="v1.0"), + Version(tag="v1.0", branch="main"), + None, + Version(tag="v1.0"), + ), + ( + "already-pinned-revision-matches-branch-differs", + Version(revision="abc123"), + Version(revision="abc123", branch="feature"), + None, + Version(revision="abc123"), + ), + ( + "tag-differs-triggers-freeze", + Version(tag="v1.0"), + Version(tag="v2.0", branch="main"), + "v2.0", + Version(tag="v2.0", branch="main"), + ), + ( + "revision-differs-triggers-freeze", + Version(revision="abc123"), + Version(revision="def456", branch="main"), + "def456", + Version(revision="def456", branch="main"), + ), + ( + "no-on-disk-version", + Version(tag="v1.0"), + None, + None, + Version(tag="v1.0"), + ), + ], +) +def test_freeze_project( + name: str, + project_version: Version, + on_disk_version: Union[Version, None], + expect_return: Union[str, None], + expect_project_version: Version, +): + with patch("dfetch.project.subproject.os.path.exists") as mocked_path_exists: + with patch("dfetch.project.subproject.Metadata.from_file") as mocked_metadata: + subproject = ConcreteSubProject(ProjectEntry({"name": "proj1"})) + + mocked_path_exists.return_value = bool(on_disk_version) + mocked_metadata().version = on_disk_version + + project = ProjectEntry({"name": "proj1"}) + project.version = project_version + + result = subproject.freeze_project(project) + + assert result == expect_return + assert project.version == expect_project_version + + @pytest.mark.parametrize( "ci_env_value, expected_result", [ diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 00000000..e346515f --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,56 @@ +"""Unit tests for dfetch.util.util.""" + +# mypy: ignore-errors +# flake8: noqa + +import pytest + +from dfetch.util.util import copy_src_subset + +# --------------------------------------------------------------------------- +# copy_src_subset – path-traversal protection +# --------------------------------------------------------------------------- + + +def test_copy_src_subset_copies_file(tmp_path): + src_root = tmp_path / "src" + src_root.mkdir() + (src_root / "lib.h").write_text("content") + dest = tmp_path / "dest" + dest.mkdir() + + copy_src_subset(str(src_root), str(dest), "lib.h", keep_licenses=False) + + assert (dest / "lib.h").read_text() == "content" + + +def test_copy_src_subset_copies_directory(tmp_path): + src_root = tmp_path / "src" + src_root.mkdir() + sub = src_root / "subdir" + sub.mkdir() + (sub / "a.c").write_text("code") + dest = tmp_path / "dest" + dest.mkdir() + + copy_src_subset(str(src_root), str(dest), "subdir", keep_licenses=False) + + assert (dest / "a.c").read_text() == "code" + + +@pytest.mark.parametrize( + "evil_src", + [ + "../outside.txt", + "../../etc/passwd", + "/etc/passwd", + ], +) +def test_copy_src_subset_rejects_path_traversal(tmp_path, evil_src): + src_root = tmp_path / "src" + src_root.mkdir() + dest = tmp_path / "dest" + dest.mkdir() + + with pytest.raises(RuntimeError): + copy_src_subset(str(src_root), str(dest), evil_src, keep_licenses=False) From 7d7f9e79cc6ae0ce287834c1113ab493a17b1fc0 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 21 Mar 2026 13:50:58 +0000 Subject: [PATCH 19/35] Fix test --- features/report-sbom.feature | 1 + 1 file changed, 1 insertion(+) diff --git a/features/report-sbom.feature b/features/report-sbom.feature index 19a47b63..8e8c65fb 100644 --- 
a/features/report-sbom.feature +++ b/features/report-sbom.feature @@ -94,6 +94,7 @@ Feature: Create an CycloneDX sbom "url": "https://github.com/cpputest/cpputest" } ], + "group": "cpputest", "licenses": [ { "license": { From ec1068596c63f80c82a98a76fd79960f2f3b2051 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 21 Mar 2026 20:01:04 +0000 Subject: [PATCH 20/35] Add feature test --- dfetch/vcs/archive.py | 59 ++++++++++++++++++++++++------------ features/report-sbom.feature | 35 +++++++++++++++++++++ 2 files changed, 74 insertions(+), 20 deletions(-) diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index a4d53996..77cc0451 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -159,34 +159,53 @@ def download(self, dest_path: str, algorithm: str | None = None) -> str | None: ) return hasher.hexdigest() if hasher else None + _MAX_REDIRECTS = 10 + def _http_download( self, parsed: urllib.parse.ParseResult, dest_path: str, hasher: hashlib._Hash | None = None, ) -> None: - """Download an HTTP/HTTPS resource to *dest_path*. + """Download an HTTP/HTTPS resource to *dest_path*, following redirects. - When *hasher* is provided each chunk is fed into it during streaming, - so the caller gets the hash without an extra file read. + Up to :attr:`_MAX_REDIRECTS` 3xx redirects are followed transparently + (e.g. GitHub archive URLs redirect to a CDN). When *hasher* is + provided each chunk is fed into it during streaming, so the caller gets + the hash without an extra file read. """ - conn = _http_conn(parsed.scheme, parsed.netloc, timeout=60) - try: - conn.request("GET", _resource_path(parsed)) - resp = conn.getresponse() - if resp.status != 200: - raise RuntimeError(f"HTTP {resp.status} when downloading '{self.url}'") - with open(dest_path, "wb") as fh: - while chunk := resp.read(65536): - fh.write(chunk) - if hasher: - hasher.update(chunk) - except (OSError, http.client.HTTPException) as exc: - raise RuntimeError( - f"'{self.url}' is not a valid URL or unreachable: {exc}" - ) from exc - finally: - conn.close() + for _ in range(self._MAX_REDIRECTS + 1): + conn = _http_conn(parsed.scheme, parsed.netloc, timeout=60) + try: + conn.request("GET", _resource_path(parsed)) + resp = conn.getresponse() + if resp.status in (301, 302, 303, 307, 308): + location = resp.getheader("Location", "") + if not location: + raise RuntimeError( + f"Redirect with no Location header from '{parsed.geturl()}'" + ) + parsed = urllib.parse.urlparse( + urllib.parse.urljoin(parsed.geturl(), location) + ) + continue + if resp.status != 200: + raise RuntimeError( + f"HTTP {resp.status} when downloading '{self.url}'" + ) + with open(dest_path, "wb") as fh: + while chunk := resp.read(65536): + fh.write(chunk) + if hasher: + hasher.update(chunk) + return + except (OSError, http.client.HTTPException) as exc: + raise RuntimeError( + f"'{self.url}' is not a valid URL or unreachable: {exc}" + ) from exc + finally: + conn.close() + raise RuntimeError(f"Too many redirects when downloading '{self.url}'") class ArchiveLocalRepo: diff --git a/features/report-sbom.feature b/features/report-sbom.feature index 8e8c65fb..e0052aa2 100644 --- a/features/report-sbom.feature +++ b/features/report-sbom.feature @@ -227,3 +227,38 @@ Feature: Create an CycloneDX sbom "specVersion": "1.6" } """ + + Scenario: A fetched archive dependency generates a json sbom with distribution reference + Given the manifest 'dfetch.yaml' + """ + manifest: + version: '0.0' + + projects: + - name: test-repo-headers + url: 
https://github.com/dfetch-org/test-repo/archive/refs/tags/v1.tar.gz + vcs: archive + ignore: + - '*.md' + - '*.txt' + """ + And all projects are updated + When I run "dfetch report -t sbom" + Then the 'report.json' json file includes + """ + { + "components": [ + { + "name": "test-repo-headers", + "group": "github.com", + "type": "library", + "externalReferences": [ + { + "type": "distribution", + "url": "https://github.com/dfetch-org/test-repo/archive/refs/tags/v1.tar.gz" + } + ] + } + ] + } + """ From 0912cc6fa6736d9f987caef09c8e66c7a014cfd3 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 21 Mar 2026 20:09:01 +0000 Subject: [PATCH 21/35] Add example to example/dfetch.yaml --- example/dfetch.yaml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/example/dfetch.yaml b/example/dfetch.yaml index 7f950abd..25420554 100644 --- a/example/dfetch.yaml +++ b/example/dfetch.yaml @@ -39,6 +39,15 @@ manifest: src: src - name: cpputest-git-rev-only - dst: Tests/cpputest-git-rev-only revision: d14505cc9191fcf17ccbd92af1c3409eb3969890 repo-path: cpputest/cpputest.git # Use external git directly + + - name: cppcheck-archive + remote: github + dst: Tests/cpputest-archive + repo-path: danmar/cppcheck/archive/2.20.0.tar.gz + ignore: + - tests + - .github + integrity: + hash: sha256:7be7992439339017edb551d8e7d2315f9bb57c402da50c2cee9cd0e2724600a1 From dfc577810e0bc87570132030e4a18da9ffe03b57 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 21 Mar 2026 21:33:59 +0000 Subject: [PATCH 22/35] Ensure consistent hash --- dfetch/commands/update.py | 7 +++- dfetch/commands/update_patch.py | 8 +++-- dfetch/project/subproject.py | 25 +++++++++---- dfetch/util/util.py | 19 +++++++--- features/check-archive.feature | 26 ++++++++++++++ tests/test_subproject.py | 37 ++++++++++++++++++- tests/test_update.py | 5 +-- tests/test_util.py | 63 ++++++++++++++++++++++++++++++++- 8 files changed, 171 insertions(+), 19 deletions(-) diff --git a/dfetch/commands/update.py b/dfetch/commands/update.py index 6d108607..9997b973 100644 --- a/dfetch/commands/update.py +++ b/dfetch/commands/update.py @@ -89,9 +89,14 @@ def __call__(self, args: argparse.Namespace) -> None: for project in superproject.manifest.selected_projects(args.projects): with catch_runtime_exceptions(exceptions) as exceptions: self._check_destination(project, destinations) + destination = project.destination + + def _ignored(dst: str = destination) -> list[str]: + return list(superproject.ignored_files(dst)) + dfetch.project.create_sub_project(project).update( force=args.force, - files_to_ignore=superproject.ignored_files(project.destination), + ignored_files_callback=_ignored, ) if not args.no_recommendations and os.path.isdir( diff --git a/dfetch/commands/update_patch.py b/dfetch/commands/update_patch.py index efbb4710..53998602 100644 --- a/dfetch/commands/update_patch.py +++ b/dfetch/commands/update_patch.py @@ -86,8 +86,10 @@ def __call__(self, args: argparse.Namespace) -> None: for project in superproject.manifest.selected_projects(args.projects): with catch_runtime_exceptions(exceptions) as exceptions: subproject = dfetch.project.create_sub_project(project) + destination = project.destination - files_to_ignore = superproject.ignored_files(project.destination) + def _ignored(dst: str = destination) -> list[str]: + return list(superproject.ignored_files(dst)) # Check if the project has a patch, maybe suggest creating one? 
if not subproject.patch: @@ -118,7 +120,7 @@ def __call__(self, args: argparse.Namespace) -> None: # force update to fetched version from metadata without applying patch subproject.update( force=True, - files_to_ignore=files_to_ignore, + ignored_files_callback=_ignored, patch_count=len(subproject.patch) - 1, ) @@ -141,7 +143,7 @@ def __call__(self, args: argparse.Namespace) -> None: # force update again to fetched version from metadata but with applying patch subproject.update( - force=True, files_to_ignore=files_to_ignore, patch_count=-1 + force=True, ignored_files_callback=_ignored, patch_count=-1 ) if exceptions: diff --git a/dfetch/project/subproject.py b/dfetch/project/subproject.py index 9c705d10..a520b37d 100644 --- a/dfetch/project/subproject.py +++ b/dfetch/project/subproject.py @@ -3,7 +3,7 @@ import os import pathlib from abc import ABC, abstractmethod -from collections.abc import Sequence +from collections.abc import Callable, Sequence from dfetch.log import get_logger from dfetch.manifest.project import ProjectEntry @@ -90,7 +90,7 @@ def update_is_required(self, force: bool = False) -> Version | None: def update( self, force: bool = False, - files_to_ignore: Sequence[str] | None = None, + ignored_files_callback: Callable[[], Sequence[str]] | None = None, patch_count: int = -1, ) -> None: """Update this subproject if required. @@ -98,7 +98,11 @@ def update( Args: force (bool, optional): Ignore if version is ok or any local changes were done. Defaults to False. - files_to_ignore (Sequence[str], optional): list of files that are ok to overwrite. + ignored_files_callback (Callable, optional): Called to obtain the set of files + to ignore. Invoked twice: once before clearing the destination (to detect + pre-existing local changes) and once after extraction (to compute the stored + hash). Calling it at both points ensures the stored hash and the check-time + hash use the same skiplist, preventing false "local changes" reports. patch_count (int, optional): Number of patches to apply (-1 means all). """ to_fetch = self.update_is_required(force) @@ -106,9 +110,11 @@ def update( if not to_fetch: return - files_to_ignore = files_to_ignore or [] + pre_fetch_ignored = ( + list(ignored_files_callback()) if ignored_files_callback else [] + ) - if not force and self._are_there_local_changes(files_to_ignore): + if not force and self._are_there_local_changes(pre_fetch_ignored): self._log_project( "skipped - local changes after last update (use --force to overwrite)" ) @@ -128,9 +134,16 @@ def update( applied_patches = self._apply_patches(patch_count) + post_fetch_ignored = ( + list(ignored_files_callback()) if ignored_files_callback else [] + ) + self.__metadata.fetched( actually_fetched, - hash_=hash_directory(self.local_path, skiplist=[self.__metadata.FILENAME]), + hash_=hash_directory( + self.local_path, + skiplist=[self.__metadata.FILENAME] + post_fetch_ignored, + ), patch_=applied_patches, ) diff --git a/dfetch/util/util.py b/dfetch/util/util.py index 09d5606c..bcf27332 100644 --- a/dfetch/util/util.py +++ b/dfetch/util/util.py @@ -179,18 +179,27 @@ def find_file(name: str, path: str = ".") -> list[str]: def hash_directory(path: str, skiplist: list[str] | None) -> str: - """Hash a directory with all its files.""" + """Hash a directory with all its files. + + Files are visited in a deterministic, sorted order so that the hash is + identical regardless of filesystem traversal order. 
The relative path of + each file (not just its basename) is included in the hash so that files + with the same name in different sub-directories are distinguished. + """ digest = hashlib.md5(usedforsecurity=False) skiplist = skiplist or [] - for root, _, files in os.walk(path): - for name in files: + for root, dirs, files in os.walk(path): + dirs.sort() # Ensure deterministic directory traversal order + for name in sorted(files): if name not in skiplist: file_path = os.path.join(root, name) + rel_path = os.path.relpath(file_path, path) - # Hash the path and add to the digest to account for empty files/directories + # Hash the relative path to account for empty files/directories + # and to distinguish same-named files in different sub-directories digest.update( - hashlib.md5(name.encode(), usedforsecurity=False).digest() + hashlib.md5(rel_path.encode(), usedforsecurity=False).digest() ) digest = hash_file(file_path, digest) diff --git a/features/check-archive.feature b/features/check-archive.feature index fa8547d0..1ba56760 100644 --- a/features/check-archive.feature +++ b/features/check-archive.feature @@ -92,6 +92,32 @@ Feature: Checking dependencies from an archive > wanted (https://dfetch.invalid/does-not-exist.tar.gz), but not available at the upstream. """ + Scenario: Archive project with ignore list shows no local changes after fresh fetch + Given an archive "SomeProject.tar.gz" with the files + | path | + | README.md | + | src/main.c | + | tests/test_main.c | + And the manifest 'dfetch.yaml' in MyProject + """ + manifest: + version: '0.0' + projects: + - name: SomeProject + url: some-remote-server/SomeProject.tar.gz + vcs: archive + ignore: + - tests + """ + And all projects are updated in MyProject + When I run "dfetch check SomeProject" in MyProject + Then the output shows + """ + Dfetch (0.12.1) + SomeProject: + > up-to-date (some-remote-server/SomeProject.tar.gz) + """ + Scenario: Archive with local changes is reported Given an archive "SomeProject.tar.gz" with the files | path | diff --git a/tests/test_subproject.py b/tests/test_subproject.py index 2fe9976d..3f8bd221 100644 --- a/tests/test_subproject.py +++ b/tests/test_subproject.py @@ -4,7 +4,7 @@ # flake8: noqa from typing import Optional, Union -from unittest.mock import patch +from unittest.mock import MagicMock, call, patch import pytest @@ -135,6 +135,41 @@ def test_are_there_local_changes( ) +def test_update_uses_ignored_files_callback_for_stored_hash(): + """The hash stored after fetch must use the post-fetch ignored files. + + The callback is called twice: once before clearing (pre-fetch local-changes + check) and once after extraction (to compute the stored hash). The second + call returns the post-extraction state so the stored hash matches what + dfetch check will compute later. 
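+ + Concretely: if the callback returns R1 before clearing and R2 after + extraction, the stored hash must be computed with a skiplist that + contains R2 (plus the metadata file) and not R1.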
+ """ + pre_fetch_ignored = ["old_file.txt"] + post_fetch_ignored = ["new_ignored.txt"] + + # Return different values on successive calls to simulate pre/post extraction + callback = MagicMock(side_effect=[pre_fetch_ignored, post_fetch_ignored]) + + with patch("dfetch.project.subproject.os.path.exists") as mock_exists: + with patch("dfetch.project.subproject.Metadata.from_file") as mock_meta_file: + with patch("dfetch.project.subproject.hash_directory") as mock_hash: + with patch("dfetch.project.subproject.safe_rm"): + with patch("dfetch.project.subproject.Metadata.dump"): + mock_exists.return_value = True + mock_meta_file.return_value.version = Version(revision="abc") + mock_hash.return_value = "hash123" + + subproject = ConcreteSubProject(ProjectEntry({"name": "p1"})) + subproject._wanted_version = Version(revision="new") + + subproject.update(force=True, ignored_files_callback=callback) + + assert callback.call_count == 2 + # The hash must be computed with the post-fetch ignored list + hash_call_skiplist = mock_hash.call_args[1]["skiplist"] + assert "new_ignored.txt" in hash_call_skiplist + assert "old_file.txt" not in hash_call_skiplist + + @pytest.mark.parametrize( "name, project_version, on_disk_version, expect_return, expect_project_version", [ diff --git a/tests/test_update.py b/tests/test_update.py index aa78e0b4..eef4ac2c 100644 --- a/tests/test_update.py +++ b/tests/test_update.py @@ -5,7 +5,7 @@ import argparse from pathlib import Path -from unittest.mock import Mock, patch +from unittest.mock import ANY, Mock, patch import pytest @@ -75,7 +75,8 @@ def test_forced_update(): update(args) mocked_create.return_value.update.assert_called_once_with( - force=True, files_to_ignore=[] + force=True, + ignored_files_callback=ANY, ) diff --git a/tests/test_util.py b/tests/test_util.py index e346515f..1411a2e0 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -5,7 +5,7 @@ import pytest -from dfetch.util.util import copy_src_subset +from dfetch.util.util import copy_src_subset, hash_directory # --------------------------------------------------------------------------- # copy_src_subset – path-traversal protection @@ -54,3 +54,64 @@ def test_copy_src_subset_rejects_path_traversal(tmp_path, evil_src): with pytest.raises(RuntimeError): copy_src_subset(str(src_root), str(dest), evil_src, keep_licenses=False) + + +# --------------------------------------------------------------------------- +# hash_directory – determinism +# --------------------------------------------------------------------------- + + +def test_hash_directory_is_deterministic(tmp_path): + """hash_directory must return the same value on repeated calls.""" + d = tmp_path / "proj" + d.mkdir() + (d / "a.c").write_text("int main(){}") + (d / "b.h").write_text("#pragma once") + sub = d / "src" + sub.mkdir() + (sub / "util.c").write_text("void util(){}") + + assert hash_directory(str(d), None) == hash_directory(str(d), None) + + +def test_hash_directory_differs_when_file_content_changes(tmp_path): + """Modifying a file must produce a different hash.""" + d = tmp_path / "proj" + d.mkdir() + f = d / "file.txt" + f.write_text("original") + + h1 = hash_directory(str(d), None) + f.write_text("modified") + h2 = hash_directory(str(d), None) + + assert h1 != h2 + + +def test_hash_directory_differs_for_same_name_in_different_subdirs(tmp_path): + """Files with identical names but in different sub-directories must affect the hash.""" + d1 = tmp_path / "proj1" + d1.mkdir() + (d1 / "a").mkdir() + (d1 / "a" / "file.txt").write_text("in 
a") + + d2 = tmp_path / "proj2" + d2.mkdir() + (d2 / "b").mkdir() + (d2 / "b" / "file.txt").write_text("in a") + + assert hash_directory(str(d1), None) != hash_directory(str(d2), None) + + +def test_hash_directory_skiplist_excludes_file(tmp_path): + """Files listed in skiplist must not contribute to the hash.""" + d = tmp_path / "proj" + d.mkdir() + (d / "tracked.txt").write_text("data") + (d / "ignored.txt").write_text("ignored data") + + h_with_skip = hash_directory(str(d), ["ignored.txt"]) + (d / "ignored.txt").write_text("changed ignored data") + h_with_skip2 = hash_directory(str(d), ["ignored.txt"]) + + assert h_with_skip == h_with_skip2 From 96dfd325aa21d4c0a5b690e6041dceb8464bacb9 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 21 Mar 2026 22:06:11 +0000 Subject: [PATCH 23/35] Centralize path traversal check --- dfetch/commands/format_patch.py | 12 ++++++------ dfetch/commands/update.py | 13 +++++++++---- dfetch/commands/update_patch.py | 10 +++++++--- dfetch/manifest/parse.py | 10 ++++++++-- dfetch/project/gitsuperproject.py | 7 ++----- dfetch/project/superproject.py | 7 ++----- dfetch/project/svnsuperproject.py | 6 ++---- dfetch/util/util.py | 27 ++++++++++++++++++++++++--- dfetch/vcs/archive.py | 29 +++++++++++++++-------------- tests/test_patch.py | 2 +- 10 files changed, 76 insertions(+), 47 deletions(-) diff --git a/dfetch/commands/format_patch.py b/dfetch/commands/format_patch.py index a0358668..ad092624 100644 --- a/dfetch/commands/format_patch.py +++ b/dfetch/commands/format_patch.py @@ -37,7 +37,11 @@ from dfetch.project.gitsubproject import GitSubProject from dfetch.project.subproject import SubProject from dfetch.project.svnsubproject import SvnSubProject -from dfetch.util.util import catch_runtime_exceptions, in_directory +from dfetch.util.util import ( + catch_runtime_exceptions, + check_no_path_traversal, + in_directory, +) from dfetch.vcs.patch import Patch, PatchAuthor, PatchInfo, PatchType logger = get_logger(__name__) @@ -80,11 +84,7 @@ def __call__(self, args: argparse.Namespace) -> None: output_dir_path = pathlib.Path(args.output_directory).resolve() - if not output_dir_path.is_relative_to(superproject.root_directory): - raise RuntimeError( - f"Output directory '{output_dir_path}' must be inside" - f" the superproject root '{superproject.root_directory}'" - ) + check_no_path_traversal(output_dir_path, superproject.root_directory) output_dir_path.mkdir(parents=True, exist_ok=True) diff --git a/dfetch/commands/update.py b/dfetch/commands/update.py index 9997b973..6e44ca35 100644 --- a/dfetch/commands/update.py +++ b/dfetch/commands/update.py @@ -41,7 +41,11 @@ from dfetch.commands.common import check_sub_manifests from dfetch.log import get_logger from dfetch.project import create_super_project -from dfetch.util.util import catch_runtime_exceptions, in_directory +from dfetch.util.util import ( + catch_runtime_exceptions, + check_no_path_traversal, + in_directory, +) logger = get_logger(__name__) @@ -126,8 +130,9 @@ def _check_path_traversal( project: dfetch.manifest.project.ProjectEntry, real_path: str, safe_dir: str ) -> None: """Check if destination is outside the directory tree.""" - if os.path.commonprefix((real_path, safe_dir)) != safe_dir: - # See https://owasp.org/www-community/attacks/Path_Traversal + try: + check_no_path_traversal(real_path, safe_dir) + except RuntimeError: logger.print_warning_line( project.name, f'Skipping, path "{project.destination}" is outside manifest directory tree.', @@ -135,7 +140,7 @@ def _check_path_traversal( raise RuntimeError( 
"Destination must be in the manifests folder or a subfolder. " f'"{project.destination}" is outside this tree!' - ) + ) from None @staticmethod def _check_dst_not_in_blacklist( diff --git a/dfetch/commands/update_patch.py b/dfetch/commands/update_patch.py index 53998602..c0117180 100644 --- a/dfetch/commands/update_patch.py +++ b/dfetch/commands/update_patch.py @@ -41,7 +41,11 @@ from dfetch.project.gitsuperproject import GitSuperProject from dfetch.project.metadata import Metadata from dfetch.project.superproject import NoVcsSuperProject, RevisionRange -from dfetch.util.util import catch_runtime_exceptions, in_directory +from dfetch.util.util import ( + catch_runtime_exceptions, + check_no_path_traversal, + in_directory, +) logger = get_logger(__name__) @@ -160,8 +164,8 @@ def _update_patch( patch_path = pathlib.Path(patch_to_update).resolve() try: - patch_path.relative_to(root) - except ValueError: + check_no_path_traversal(patch_path, root) + except RuntimeError: logger.print_warning_line( project_name, f'No updating patch "{patch_to_update}" which is outside {root}', diff --git a/dfetch/manifest/parse.py b/dfetch/manifest/parse.py index df29901d..1c0bb4e3 100644 --- a/dfetch/manifest/parse.py +++ b/dfetch/manifest/parse.py @@ -10,7 +10,11 @@ from dfetch.log import get_logger from dfetch.manifest.manifest import Manifest, ManifestDict from dfetch.manifest.schema import MANIFEST_SCHEMA -from dfetch.util.util import find_file, prefix_runtime_exceptions +from dfetch.util.util import ( + check_no_path_traversal, + find_file, + prefix_runtime_exceptions, +) logger = get_logger(__name__) @@ -92,7 +96,9 @@ def get_submanifests(skip: list[str] | None = None) -> list[Manifest]: for path in find_file(DEFAULT_MANIFEST_NAME, root_dir): path = os.path.realpath(path) - if os.path.commonprefix((path, root_dir)) != root_dir: + try: + check_no_path_traversal(path, root_dir) + except RuntimeError: logger.warning(f"Sub-manifest {path} is outside {root_dir}") continue diff --git a/dfetch/project/gitsuperproject.py b/dfetch/project/gitsuperproject.py index ec9f7928..d5e547a9 100644 --- a/dfetch/project/gitsuperproject.py +++ b/dfetch/project/gitsuperproject.py @@ -16,7 +16,7 @@ from dfetch.project.gitsubproject import GitSubProject from dfetch.project.subproject import SubProject from dfetch.project.superproject import RevisionRange, SuperProject -from dfetch.util.util import resolve_absolute_path +from dfetch.util.util import check_no_path_traversal, resolve_absolute_path from dfetch.vcs.git import GitLocalRepo logger = get_logger(__name__) @@ -43,10 +43,7 @@ def ignored_files(self, path: str) -> Sequence[str]: """Return a list of files that can be ignored in a given path.""" resolved_path = resolve_absolute_path(path) - if not resolved_path.is_relative_to(self.root_directory): - raise RuntimeError( - f"{resolved_path} not in superproject {self.root_directory}!" 
- ) + check_no_path_traversal(resolved_path, self.root_directory) return GitLocalRepo.ignored_files(path) diff --git a/dfetch/project/superproject.py b/dfetch/project/superproject.py index e7c8a199..4d5cce56 100644 --- a/dfetch/project/superproject.py +++ b/dfetch/project/superproject.py @@ -19,7 +19,7 @@ from dfetch.manifest.manifest import Manifest from dfetch.manifest.project import ProjectEntry from dfetch.project.subproject import SubProject -from dfetch.util.util import resolve_absolute_path +from dfetch.util.util import check_no_path_traversal, resolve_absolute_path logger = get_logger(__name__) @@ -136,10 +136,7 @@ def ignored_files(self, path: str) -> Sequence[str]: """Return a list of files that can be ignored in a given path.""" resolved_path = resolve_absolute_path(path) - if not resolved_path.is_relative_to(self.root_directory): - raise RuntimeError( - f"{resolved_path} not in superproject {self.root_directory}!" - ) + check_no_path_traversal(resolved_path, self.root_directory) return [] diff --git a/dfetch/project/svnsuperproject.py b/dfetch/project/svnsuperproject.py index e19aa11f..c3d708bb 100644 --- a/dfetch/project/svnsuperproject.py +++ b/dfetch/project/svnsuperproject.py @@ -17,6 +17,7 @@ from dfetch.project.superproject import RevisionRange, SuperProject from dfetch.project.svnsubproject import SvnSubProject from dfetch.util.util import ( + check_no_path_traversal, in_directory, resolve_absolute_path, ) @@ -47,10 +48,7 @@ def ignored_files(self, path: str) -> Sequence[str]: """Return a list of files that can be ignored in a given path.""" resolved_path = resolve_absolute_path(path) - if not resolved_path.is_relative_to(self.root_directory): - raise RuntimeError( - f"{resolved_path} not in superproject {self.root_directory}!" - ) + check_no_path_traversal(resolved_path, self.root_directory) return SvnRepo.ignored_files(path) diff --git a/dfetch/util/util.py b/dfetch/util/util.py index bcf27332..cb37a39d 100644 --- a/dfetch/util/util.py +++ b/dfetch/util/util.py @@ -54,11 +54,9 @@ def copy_src_subset( Raises: RuntimeError: When *src* does not exist inside *src_root*. """ - resolved_src_root = os.path.realpath(src_root) src_path = os.path.join(src_root, src) + check_no_path_traversal(src_path, src_root) resolved_src_path = os.path.realpath(src_path) - if os.path.commonpath([resolved_src_root, resolved_src_path]) != resolved_src_root: - raise RuntimeError(f"src {src!r} escapes the source root") if os.path.isdir(resolved_src_path): copy_directory_contents(resolved_src_path, dest_dir) elif os.path.isfile(resolved_src_path): @@ -243,6 +241,29 @@ def str_if_possible(data: list[str]) -> str | list[str]: return "" if not data else data[0] if len(data) == 1 else data +def check_no_path_traversal(path: str | Path, root: str | Path) -> None: + """Raise *RuntimeError* if *path* escapes *root*. + + Both *path* and *root* are resolved with :func:`os.path.realpath` before + comparison, so symlinks and relative ``..`` components cannot bypass the + check. + + See https://owasp.org/www-community/attacks/Path_Traversal + + Raises: + RuntimeError: When *path* resolves to a location outside *root*. 
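+ + A short illustration (paths are hypothetical and assumed symlink-free):: + + check_no_path_traversal("/repo/libs/a.c", "/repo") # fine + check_no_path_traversal("/repo/libs/../../etc", "/repo") # raises RuntimeError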
+ """ + resolved_root = os.path.realpath(root) + resolved_path = os.path.realpath(path) + try: + escapes = os.path.commonpath([resolved_root, resolved_path]) != resolved_root + except ValueError: + # commonpath raises ValueError on Windows when paths span different drives + escapes = True + if escapes: + raise RuntimeError(f"{str(path)!r} is outside root {str(root)!r}") + + def resolve_absolute_path(path: str | Path) -> Path: """Return a guaranteed absolute Path, resolving symlinks. diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index 77cc0451..68a79a5b 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -278,6 +278,19 @@ def _check_archive_limits(member_count: int, total_bytes: int) -> None: f"safety limit of {_MAX_UNCOMPRESSED_BYTES} bytes." ) + @staticmethod + def _check_archive_member_path(name: str) -> None: + """Raise *RuntimeError* if *name* is an unsafe archive member path. + + Rejects absolute paths and any ``..`` path component. + + Raises: + RuntimeError: When *name* is absolute or contains a ``..`` component. + """ + member_path = pathlib.PurePosixPath(name) + if member_path.is_absolute() or any(part == ".." for part in member_path.parts): + raise RuntimeError(f"Archive contains an unsafe member path: {name!r}") + @staticmethod def check_zip_members(zf: zipfile.ZipFile) -> list[zipfile.ZipInfo]: """Validate all ZIP member paths against path-traversal attacks. @@ -295,13 +308,7 @@ def check_zip_members(zf: zipfile.ZipFile) -> list[zipfile.ZipInfo]: len(members), sum(info.file_size for info in members) ) for info in members: - member_path = pathlib.PurePosixPath(info.filename) - if member_path.is_absolute() or any( - part == ".." for part in member_path.parts - ): - raise RuntimeError( - f"Archive contains an unsafe member path: {info.filename!r}" - ) + ArchiveLocalRepo._check_archive_member_path(info.filename) return members @staticmethod @@ -322,13 +329,7 @@ def _check_tar_members(tf: tarfile.TarFile) -> None: len(members), sum(m.size for m in members if m.isfile()) ) for member in members: - member_path = pathlib.PurePosixPath(member.name) - if member_path.is_absolute() or any( - part == ".." 
for part in member_path.parts - ): - raise RuntimeError( - f"Archive contains an unsafe member path: {member.name!r}" - ) + ArchiveLocalRepo._check_archive_member_path(member.name) @@ staticmethod def _extract_raw(archive_path: str, dest_dir: str) -> None: diff --git a/tests/test_patch.py b/tests/test_patch.py index 2901aa07..6b408b8a 100644 --- a/tests/test_patch.py +++ b/tests/test_patch.py @@ -253,7 +253,7 @@ def test_reverse_patch_zero_length_hunk(): min_size=5, max_size=20, alphabet=st.characters( - blacklist_categories=("Cc", "Cs"), blacklist_characters="\r\n" + blacklist_categories=("Cc", "Cs", "Zl", "Zp"), blacklist_characters="\r\n" ), ), min_size=5, From c7cfd20aa10c51ad476efaaafb12216b14283f30 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 21 Mar 2026 22:26:02 +0000 Subject: [PATCH 24/35] Review comments --- dfetch/project/subproject.py | 1 + example/dfetch.yaml | 2 +- tests/test_subproject.py | 4 ++-- tests/test_update.py | 8 ++++++++ 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/dfetch/project/subproject.py b/dfetch/project/subproject.py index a520b37d..5649b92c 100644 --- a/dfetch/project/subproject.py +++ b/dfetch/project/subproject.py @@ -420,6 +420,7 @@ def freeze_project(self, project: ProjectEntry) -> str | None: on_disk_version and project.version.tag == on_disk_version.tag and project.version.revision == on_disk_version.revision + and (bool(project.version.tag) or self.revision_is_enough()) ): return None if on_disk_version: diff --git a/example/dfetch.yaml b/example/dfetch.yaml index 25420554..e2c6b3bd 100644 --- a/example/dfetch.yaml +++ b/example/dfetch.yaml @@ -44,7 +44,7 @@ manifest: - name: cppcheck-archive remote: github - dst: Tests/cpputest-archive + dst: Tests/cppcheck-archive repo-path: danmar/cppcheck/archive/2.20.0.tar.gz ignore: - tests diff --git a/tests/test_subproject.py b/tests/test_subproject.py index 3f8bd221..462086dc 100644 --- a/tests/test_subproject.py +++ b/tests/test_subproject.py @@ -191,8 +191,8 @@ def test_update_uses_ignored_files_callback_for_stored_hash(): "already-pinned-revision-matches-branch-differs", Version(revision="abc123"), Version(revision="abc123", branch="feature"), - None, - Version(revision="abc123"), + "abc123", + Version(revision="abc123", branch="feature"), ), ( "tag-differs-triggers-freeze", diff --git a/tests/test_update.py b/tests/test_update.py index eef4ac2c..f6078185 100644 --- a/tests/test_update.py +++ b/tests/test_update.py @@ -79,6 +79,14 @@ def test_forced_update(): ignored_files_callback=ANY, ) + cb = mocked_create.return_value.update.call_args.kwargs[ + "ignored_files_callback" + ] + cb() + fake_superproject.ignored_files.assert_called_once_with( + "some_dest" + ) + def test_create_menu(): subparsers = argparse.ArgumentParser().add_subparsers() From 3ca0cb6bda0efc9269906a036f3f463aabc04971 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 21 Mar 2026 22:58:48 +0000 Subject: [PATCH 25/35] Support all 3 hash algorithms --- dfetch/manifest/schema.py | 4 +++- dfetch/project/archivesubproject.py | 11 ++++++----- dfetch/vcs/archive.py | 7 ++++--- doc/manifest.rst | 5 +++-- features/fetch-archive.feature | 22 +++++++++++++++++++--- features/steps/archive_steps.py | 20 ++++++++++++++++++++ features/steps/generic_steps.py | 4 ++++ features/steps/manifest_steps.py | 4 ++++ features/validate-manifest.feature | 16 ++++++++++++++-- 9 files changed, 77 insertions(+), 16 deletions(-) diff --git a/dfetch/manifest/schema.py b/dfetch/manifest/schema.py index f9dda64a..f7b59f0a 100644 --- a/dfetch/manifest/schema.py +++
b/dfetch/manifest/schema.py @@ -15,7 +15,9 @@ } ) -HASH_STR = Regex(r"^(sha256):[a-fA-F0-9]{64}$") +HASH_STR = Regex( + r"^(sha256:[a-fA-F0-9]{64}|sha384:[a-fA-F0-9]{96}|sha512:[a-fA-F0-9]{128})$" +) # ``integrity:`` block — designed for future extension with ``sig:`` and # ``sig-key:`` fields for detached signature / signing-key verification. diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index 59dbaafd..26486a2c 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -17,7 +17,7 @@ The ``integrity:`` block is designed for future extension: ``sig:`` and ``sig-key:`` fields for detached signature / signing-key verification will slot in alongside ``hash:`` without breaking existing manifests. -Only ``sha256`` is supported today. +Supported hash algorithms: ``sha256``, ``sha384``, and ``sha512``. Example manifest entries:: @@ -147,7 +147,7 @@ def _list_of_tags(self) -> list[str]: def wanted_version(self) -> Version: """Version derived from the ``integrity.hash`` field or the archive URL. - * With ``integrity.hash: sha256:`` → ``Version(revision='sha256:')`` + * With ``integrity.hash: :`` → ``Version(revision=':')`` * Without hash → ``Version(revision=)`` This makes the standard :class:`~dfetch.project.subproject.SubProject` @@ -212,11 +212,12 @@ def freeze_project(self, project: ProjectEntry) -> str | None: * If the archive was fetched without a hash (URL-only), the archive is downloaded again, its SHA-256 is computed, and the result is written to ``integrity.hash``. This ensures the manifest always ends up - pinned to a specific content fingerprint. + pinned to a specific content fingerprint. SHA-256 is used as the + default algorithm when no prior hash is present. Returns: - The ``sha256:`` string written to *project*, or *None* if the - manifest was already up-to-date. + The ``:`` string written to *project*, or *None* if + the manifest was already up-to-date. Raises: RuntimeError: On download or hash-computation failure so the caller diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index 68a79a5b..b1c2f665 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -5,9 +5,10 @@ :mod:`urllib.request` can reach (``http://``, ``https://``, ``file://``, …). Optional integrity checking is supported via an ``integrity:`` manifest block. -The ``hash:`` sub-field (e.g. ``sha256:``) is supported today; the block -is designed to grow with ``sig:`` and ``sig-key:`` fields for detached -signature / signing-key verification in the future. +The ``hash:`` sub-field accepts ``sha256:`` (64 hex chars), +``sha384:`` (96 hex chars), or ``sha512:`` (128 hex chars). +The block is designed to grow with ``sig:`` and ``sig-key:`` fields for +detached signature / signing-key verification in the future. Example manifest entry:: diff --git a/doc/manifest.rst b/doc/manifest.rst index 3409afd5..a1947f86 100644 --- a/doc/manifest.rst +++ b/doc/manifest.rst @@ -123,6 +123,7 @@ Below an overview of all possible fields on the manifest. The bold items are man description: > Cryptographic hash of the archive file. Format: ``:``. - Currently ``sha256`` is supported (e.g. ``sha256:e3b0c4…``). - The format is designed for future extension to ``sha512``, etc. + Supported algorithms: ``sha256`` (64 hex chars), + ``sha384`` (96 hex chars), and ``sha512`` (128 hex chars). + Example: ``sha256:e3b0c4…``. 
uniqueItems: true diff --git a/features/fetch-archive.feature b/features/fetch-archive.feature index 564e218a..0f667f19 100644 --- a/features/fetch-archive.feature +++ b/features/fetch-archive.feature @@ -58,7 +58,7 @@ Feature: Fetching dependencies from an archive (tar/zip) dfetch.yaml """ - Scenario: Archive project with sha256 hash verification is fetched + Scenario: Archive projects with sha256, sha384 and sha512 hash verification are fetched Given an archive "SomeProject.tar.gz" with the files | path | | README.md | @@ -67,17 +67,33 @@ Feature: Fetching dependencies from an archive (tar/zip) manifest: version: '0.0' projects: - - name: SomeProject + - name: SomeProject-sha256 url: some-remote-server/SomeProject.tar.gz vcs: archive integrity: hash: sha256: + - name: SomeProject-sha384 + url: some-remote-server/SomeProject.tar.gz + vcs: archive + integrity: + hash: sha384: + - name: SomeProject-sha512 + url: some-remote-server/SomeProject.tar.gz + vcs: archive + integrity: + hash: sha512: """ When I run "dfetch update" in MyProject Then 'MyProject' looks like: """ MyProject/ - SomeProject/ + SomeProject-sha256/ + .dfetch_data.yaml + README.md + SomeProject-sha384/ + .dfetch_data.yaml + README.md + SomeProject-sha512/ .dfetch_data.yaml README.md dfetch.yaml diff --git a/features/steps/archive_steps.py b/features/steps/archive_steps.py index f1817ed9..7b0390aa 100644 --- a/features/steps/archive_steps.py +++ b/features/steps/archive_steps.py @@ -22,6 +22,24 @@ def _sha256(path: str) -> str: return h.hexdigest() +def _sha384(path: str) -> str: + """Return the SHA-384 hex digest of a file.""" + h = hashlib.sha384() + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(8192), b""): + h.update(chunk) + return h.hexdigest() + + +def _sha512(path: str) -> str: + """Return the SHA-512 hex digest of a file.""" + h = hashlib.sha512() + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(8192), b""): + h.update(chunk) + return h.hexdigest() + + def create_tar_gz(archive_path: str, name: str, files: list[dict]) -> None: """Create a .tar.gz archive with files nested under a top-level / directory.""" with tarfile.open(archive_path, "w:gz") as tar: @@ -71,6 +89,8 @@ def _create_archive(context, name: str, extension: str) -> None: create_zip(archive_path, name, files) context.archive_sha256 = _sha256(archive_path) + context.archive_sha384 = _sha384(archive_path) + context.archive_sha512 = _sha512(archive_path) context.archive_url = _archive_url(context, filename) diff --git a/features/steps/generic_steps.py b/features/steps/generic_steps.py index 5c3a7f7b..138d043a 100644 --- a/features/steps/generic_steps.py +++ b/features/steps/generic_steps.py @@ -99,6 +99,10 @@ def _apply_context_substitutions(text: str, context) -> str: """Replace dynamic placeholders with values stored on *context*.""" if hasattr(context, "archive_sha256"): text = text.replace("", context.archive_sha256) + if hasattr(context, "archive_sha384"): + text = text.replace("", context.archive_sha384) + if hasattr(context, "archive_sha512"): + text = text.replace("", context.archive_sha512) if hasattr(context, "archive_url"): text = text.replace("", context.archive_url) return text diff --git a/features/steps/manifest_steps.py b/features/steps/manifest_steps.py index 66d834c1..e0861b4c 100644 --- a/features/steps/manifest_steps.py +++ b/features/steps/manifest_steps.py @@ -19,6 +19,10 @@ def apply_manifest_substitutions(context, contents: str) -> str: ) if hasattr(context, "archive_sha256"): result = 
result.replace("", context.archive_sha256) + if hasattr(context, "archive_sha384"): + result = result.replace("", context.archive_sha384) + if hasattr(context, "archive_sha512"): + result = result.replace("", context.archive_sha512) if hasattr(context, "archive_url"): result = result.replace("", context.archive_url) return result diff --git a/features/validate-manifest.feature b/features/validate-manifest.feature index 40358a66..b1e8a82f 100644 --- a/features/validate-manifest.feature +++ b/features/validate-manifest.feature @@ -51,19 +51,31 @@ Feature: Validate a manifest unexpected key not in schema 'manifest-wrong' """ - Scenario: A valid archive manifest with integrity hash is validated + Scenario: A valid archive manifest with integrity hashes is validated Given the manifest 'dfetch.yaml' """ manifest: version: '0.0' projects: - - name: SomeLib + - name: SomeLib-sha256 url: https://example.com/SomeLib-1.0.tar.gz vcs: archive integrity: hash: sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + - name: SomeLib-sha384 + url: https://example.com/SomeLib-2.0.tar.gz + vcs: archive + integrity: + hash: sha384:38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b95b + + - name: SomeLib-sha512 + url: https://example.com/SomeLib-3.0.tar.gz + vcs: archive + integrity: + hash: sha512:cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + """ When I run "dfetch validate" Then the output shows From 6924bd0602b061815f8e3e380167062b09b2575d Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 21 Mar 2026 23:02:36 +0000 Subject: [PATCH 26/35] Update changelog --- CHANGELOG.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f71b9787..43ec044e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,11 @@ +Unreleased +========== + +* Add archive (``vcs: archive``) support for fetching dependencies from ``.tar.gz``, ``.tgz``, ``.tar.bz2``, ``.tar.xz`` and ``.zip`` files via HTTP, HTTPS or file URLs (#1058) +* Fix path-traversal check that used character-based prefix comparison instead of path-component comparison (#1058) +* Fix directory hash being non-deterministic across filesystem traversal orders, causing false local-change detection (#1058) +* Fix ``dfetch freeze`` not capturing branch information for SVN projects when only the revision matched (#1058) + Release 0.12.1 (released 2026-02-24) ==================================== From 24c512e4675b951c65a45875cc451b78f029bccd Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 21 Mar 2026 23:27:11 +0000 Subject: [PATCH 27/35] Don't change hashing algorithm --- dfetch/util/util.py | 19 +++++--------------- tests/test_util.py | 15 --------------- 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/dfetch/util/util.py b/dfetch/util/util.py index cb37a39d..d8d86e4e 100644 --- a/dfetch/util/util.py +++ b/dfetch/util/util.py @@ -177,27 +177,18 @@ def hash_directory(path: str, skiplist: list[str] | None) -> str: - """Hash a directory with all its files. - - Files are visited in a deterministic, sorted order so that the hash is - identical regardless of filesystem traversal order. The relative path of - each file (not just its basename) is included in the hash so that files - with the same name in different sub-directories are distinguished.
- """ + """Hash a directory with all its files.""" digest = hashlib.md5(usedforsecurity=False) skiplist = skiplist or [] - for root, dirs, files in os.walk(path): - dirs.sort() # Ensure deterministic directory traversal order - for name in sorted(files): + for root, _, files in os.walk(path): + for name in files: if name not in skiplist: file_path = os.path.join(root, name) - rel_path = os.path.relpath(file_path, path) - # Hash the relative path to account for empty files/directories - # and to distinguish same-named files in different sub-directories + # Hash the path and add to the digest to account for empty files/directories digest.update( - hashlib.md5(rel_path.encode(), usedforsecurity=False).digest() + hashlib.md5(name.encode(), usedforsecurity=False).digest() ) digest = hash_file(file_path, digest) diff --git a/tests/test_util.py b/tests/test_util.py index 1411a2e0..351b31f5 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -88,21 +88,6 @@ def test_hash_directory_differs_when_file_content_changes(tmp_path): assert h1 != h2 -def test_hash_directory_differs_for_same_name_in_different_subdirs(tmp_path): - """Files with identical names but in different sub-directories must affect the hash.""" - d1 = tmp_path / "proj1" - d1.mkdir() - (d1 / "a").mkdir() - (d1 / "a" / "file.txt").write_text("in a") - - d2 = tmp_path / "proj2" - d2.mkdir() - (d2 / "b").mkdir() - (d2 / "b" / "file.txt").write_text("in a") - - assert hash_directory(str(d1), None) != hash_directory(str(d2), None) - - def test_hash_directory_skiplist_excludes_file(tmp_path): """Files listed in skiplist must not contribute to the hash.""" d = tmp_path / "proj" From 6cdc76247256dfd44c5943a5a752e7438edb1cd6 Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 22 Mar 2026 07:49:07 +0000 Subject: [PATCH 28/35] Review comments --- dfetch/util/util.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/dfetch/util/util.py b/dfetch/util/util.py index d8d86e4e..498ab730 100644 --- a/dfetch/util/util.py +++ b/dfetch/util/util.py @@ -21,8 +21,12 @@ def is_license_file(filename: str) -> bool: return any(fnmatch.fnmatch(filename.lower(), pattern) for pattern in LICENSE_GLOBS) -def _copy_entry(src_entry: str, dest_entry: str) -> None: - """Copy a single file or directory *src_entry* to *dest_entry*.""" +def _copy_entry(src_entry: str, dest_entry: str, root: str) -> None: + """Copy a single file or directory *src_entry* to *dest_entry*. + + Raises :exc:`RuntimeError` if *src_entry* resolves outside *root*. + """ + check_no_path_traversal(src_entry, root) if os.path.isdir(src_entry): shutil.copytree(src_entry, dest_entry) else: @@ -35,9 +39,11 @@ def copy_directory_contents(src_dir: str, dest_dir: str) -> None: Directories are copied recursively; files are copied with metadata. """ for entry_name in os.listdir(src_dir): + src_path = os.path.join(src_dir, entry_name) _copy_entry( - os.path.join(src_dir, entry_name), + src_path, os.path.join(dest_dir, entry_name), + src_dir, ) @@ -70,6 +76,7 @@ def copy_src_subset( if keep_licenses: for entry_name in os.listdir(src_root): full_path = os.path.join(src_root, entry_name) + check_no_path_traversal(full_path, src_root) if os.path.isfile(full_path) and is_license_file(entry_name): shutil.copy2(full_path, os.path.join(dest_dir, entry_name)) @@ -79,7 +86,12 @@ def prune_files_by_pattern(directory: str, patterns: Sequence[str]) -> None: License files are never removed even when they match a pattern. 
""" + seen: set[str] = set() for file_or_dir in find_matching_files(directory, patterns): + resolved = str(file_or_dir.resolve()) + if resolved in seen: + continue + seen.add(resolved) if not (file_or_dir.is_file() and is_license_file(file_or_dir.name)): safe_rm(file_or_dir) From 6f7191a7b825e1af35dc9ff685f591fc594da15e Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 22 Mar 2026 09:46:48 +0100 Subject: [PATCH 29/35] don't follow symlinks Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- dfetch/util/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dfetch/util/util.py b/dfetch/util/util.py index 498ab730..d7537343 100644 --- a/dfetch/util/util.py +++ b/dfetch/util/util.py @@ -28,7 +28,7 @@ def _copy_entry(src_entry: str, dest_entry: str, root: str) -> None: """ check_no_path_traversal(src_entry, root) if os.path.isdir(src_entry): - shutil.copytree(src_entry, dest_entry) + shutil.copytree(src_entry, dest_entry, symlinks=True) else: shutil.copy2(src_entry, dest_entry) From 4aa19017f10ea2b694a9a6307b6f52d479a07af0 Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 22 Mar 2026 11:23:40 +0000 Subject: [PATCH 30/35] Review comments --- dfetch/project/archivesubproject.py | 18 ++++- dfetch/util/util.py | 11 ++- features/steps/archive_steps.py | 34 +++----- tests/test_archive.py | 118 +++++++++++++++++++++++++++- tests/test_util.py | 57 +++++++++++++- 5 files changed, 206 insertions(+), 32 deletions(-) diff --git a/dfetch/project/archivesubproject.py b/dfetch/project/archivesubproject.py index 26486a2c..bd702ebf 100644 --- a/dfetch/project/archivesubproject.py +++ b/dfetch/project/archivesubproject.py @@ -109,19 +109,28 @@ def _latest_revision_on_branch(self, branch: str) -> str: del branch return self.remote - def _download_and_compute_hash(self, algorithm: str = "sha256") -> IntegrityHash: + def _download_and_compute_hash( + self, algorithm: str = "sha256", url: str | None = None + ) -> IntegrityHash: """Download the archive to a temporary file and return its :class:`IntegrityHash`. The hash is computed during the download stream — no extra file read. The temporary file is always cleaned up, even on error. + Args: + algorithm: Hash algorithm to use (``sha256``, ``sha384``, ``sha512``). + url: If given, download from this URL instead of ``self._remote_repo``. + Use this to pin to the exact URL stored in the on-disk revision. + Raises: RuntimeError: On download failure or unsupported algorithm. """ - fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(self.remote)) + effective_url = url if url is not None else self.remote + remote = ArchiveRemote(effective_url) if url is not None else self._remote_repo + fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(effective_url)) os.close(fd) try: - hex_digest = self._remote_repo.download(tmp_path, algorithm=algorithm) + hex_digest = remote.download(tmp_path, algorithm=algorithm) return IntegrityHash(algorithm, hex_digest) finally: try: @@ -231,8 +240,9 @@ def freeze_project(self, project: ProjectEntry) -> str | None: revision = on_disk.revision # Already hash-pinned — use the on-disk revision directly. + # Otherwise download from the revision URL (not the possibly-updated manifest URL). 
pinned = IntegrityHash.parse(revision) or self._download_and_compute_hash( - "sha256" + "sha256", url=revision ) new_hash = str(pinned) if project.hash == new_hash: diff --git a/dfetch/util/util.py b/dfetch/util/util.py index d7537343..bca0935c 100644 --- a/dfetch/util/util.py +++ b/dfetch/util/util.py @@ -87,12 +87,21 @@ def prune_files_by_pattern(directory: str, patterns: Sequence[str]) -> None: License files are never removed even when they match a pattern. """ seen: set[str] = set() + paths = [] for file_or_dir in find_matching_files(directory, patterns): resolved = str(file_or_dir.resolve()) if resolved in seen: continue seen.add(resolved) - if not (file_or_dir.is_file() and is_license_file(file_or_dir.name)): + paths.append(file_or_dir) + + # Remove children before parents to avoid FileNotFoundError on already-deleted paths. + paths.sort(key=lambda p: len(str(p.resolve())), reverse=True) + + for file_or_dir in paths: + if file_or_dir.exists() and not ( + file_or_dir.is_file() and is_license_file(file_or_dir.name) + ): safe_rm(file_or_dir) diff --git a/features/steps/archive_steps.py b/features/steps/archive_steps.py index 7b0390aa..1c961035 100644 --- a/features/steps/archive_steps.py +++ b/features/steps/archive_steps.py @@ -13,27 +13,9 @@ from behave import given # pylint: disable=no-name-in-module -def _sha256(path: str) -> str: - """Return the SHA-256 hex digest of a file.""" - h = hashlib.sha256() - with open(path, "rb") as f: - for chunk in iter(lambda: f.read(8192), b""): - h.update(chunk) - return h.hexdigest() - - -def _sha384(path: str) -> str: - """Return the SHA-384 hex digest of a file.""" - h = hashlib.sha384() - with open(path, "rb") as f: - for chunk in iter(lambda: f.read(8192), b""): - h.update(chunk) - return h.hexdigest() - - -def _sha512(path: str) -> str: - """Return the SHA-512 hex digest of a file.""" - h = hashlib.sha512() +def _file_digest(path: str, constructor) -> str: + """Return the hex digest of *path* using the given hashlib *constructor*.""" + h = constructor() with open(path, "rb") as f: for chunk in iter(lambda: f.read(8192), b""): h.update(chunk) @@ -85,12 +67,14 @@ def _create_archive(context, name: str, extension: str) -> None: if extension == ".tar.gz": create_tar_gz(archive_path, name, files) - else: + elif extension == ".zip": create_zip(archive_path, name, files) + else: + raise ValueError(f"Unsupported archive extension: {extension!r}") - context.archive_sha256 = _sha256(archive_path) - context.archive_sha384 = _sha384(archive_path) - context.archive_sha512 = _sha512(archive_path) + context.archive_sha256 = _file_digest(archive_path, hashlib.sha256) + context.archive_sha384 = _file_digest(archive_path, hashlib.sha384) + context.archive_sha512 = _file_digest(archive_path, hashlib.sha512) context.archive_url = _archive_url(context, filename) diff --git a/tests/test_archive.py b/tests/test_archive.py index dedc422f..6e9a8c45 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -1,14 +1,18 @@ """Unit tests for dfetch.vcs.archive and dfetch.project.archivesubproject.""" +import hashlib import io import os import tarfile import tempfile import zipfile +from unittest.mock import patch import pytest -from dfetch.project.archivesubproject import _suffix_for_url +from dfetch.manifest.project import ProjectEntry +from dfetch.manifest.version import Version +from dfetch.project.archivesubproject import ArchiveSubProject, _suffix_for_url from dfetch.vcs.archive import ( ARCHIVE_EXTENSIONS, ArchiveLocalRepo, @@ -249,3 +253,115 @@ def 
test_all_archive_extensions_covered(): assert len(ARCHIVE_EXTENSIONS) > 0 for ext in ARCHIVE_EXTENSIONS: assert ext.startswith(".") + + +# --------------------------------------------------------------------------- +# Helpers shared by ArchiveSubProject tests +# --------------------------------------------------------------------------- + + +def _make_tar_gz(path: str, content: bytes = b"hello") -> None: + """Write a minimal .tar.gz archive containing one file to *path*.""" + with tarfile.open(path, "w:gz") as tf: + info = tarfile.TarInfo(name="pkg/README.md") + info.size = len(content) + tf.addfile(info, io.BytesIO(content)) + + +def _sha256_file(path: str) -> str: + h = hashlib.sha256() + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(8192), b""): + h.update(chunk) + return h.hexdigest() + + +def _file_url(path: str) -> str: + return "file:///" + path.lstrip("/") + + +def _make_subproject(url: str) -> ArchiveSubProject: + return ArchiveSubProject( + ProjectEntry({"name": "pkg", "url": url, "vcs": "archive"}) + ) + + +# --------------------------------------------------------------------------- +# ArchiveSubProject._download_and_compute_hash – explicit url parameter +# --------------------------------------------------------------------------- + + +def test_download_and_compute_hash_default_uses_remote_repo(): + """Without an explicit url the hash is computed from self._remote_repo.""" + with tempfile.TemporaryDirectory() as tmp: + archive = os.path.join(tmp, "pkg.tar.gz") + _make_tar_gz(archive) + url = _file_url(archive) + sp = _make_subproject(url) + + result = sp._download_and_compute_hash("sha256") + + assert result.algorithm == "sha256" + assert result.hex_digest == _sha256_file(archive) + + +def test_download_and_compute_hash_explicit_url_overrides_remote_repo(): + """When *url* is supplied a fresh ArchiveRemote for that URL is used. + + This is the regression guard for the fix: if the manifest URL was changed + after fetching, freeze must still hash the *original* archive (the one + recorded in the on-disk revision), not the current manifest URL. + """ + with tempfile.TemporaryDirectory() as tmp: + archive_a = os.path.join(tmp, "pkg_a.tar.gz") + archive_b = os.path.join(tmp, "pkg_b.tar.gz") + _make_tar_gz(archive_a, content=b"version A") + _make_tar_gz(archive_b, content=b"version B") + url_a = _file_url(archive_a) + url_b = _file_url(archive_b) + + # SubProject points to archive_b (current manifest URL). + sp = _make_subproject(url_b) + + # Passing url=url_a must use archive_a's content. + result = sp._download_and_compute_hash("sha256", url=url_a) + + assert result.hex_digest == _sha256_file(archive_a) + assert result.hex_digest != _sha256_file(archive_b) + + +# --------------------------------------------------------------------------- +# ArchiveSubProject.freeze_project – uses on-disk revision URL +# --------------------------------------------------------------------------- + + +def test_freeze_project_uses_on_disk_url_not_manifest_url(): + """freeze_project must hash the archive at the on-disk revision URL. + + Scenario: the manifest URL was updated after the last fetch. Without the + fix, freeze would download from the new (current) manifest URL and produce + a hash that doesn't match the fetched archive. With the fix it uses the + URL stored in the on-disk revision. 
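+ + Setup sketch: the manifest entry points at archive B while the on-disk + revision records the URL of archive A; the frozen hash must therefore + be archive A's sha256.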
+ """ + with tempfile.TemporaryDirectory() as tmp: + archive_a = os.path.join(tmp, "pkg_a.tar.gz") + archive_b = os.path.join(tmp, "pkg_b.tar.gz") + _make_tar_gz(archive_a, content=b"original fetch") + _make_tar_gz(archive_b, content=b"updated manifest url") + url_a = _file_url(archive_a) + url_b = _file_url(archive_b) + + # SubProject now points to archive_b (manifest was updated after fetch). + sp = _make_subproject(url_b) + + # Simulate on-disk state: was fetched from url_a (no hash-pin at the time). + on_disk = Version(revision=url_a) + with patch.object(sp, "on_disk_version", return_value=on_disk): + project_entry = ProjectEntry( + {"name": "pkg", "url": url_b, "vcs": "archive"} + ) + sp.freeze_project(project_entry) + + expected_hash = f"sha256:{_sha256_file(archive_a)}" + assert project_entry.hash == expected_hash + assert _sha256_file(archive_b) not in project_entry.hash diff --git a/tests/test_util.py b/tests/test_util.py index 351b31f5..5e7010c3 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -5,7 +5,7 @@ import pytest -from dfetch.util.util import copy_src_subset, hash_directory +from dfetch.util.util import copy_src_subset, hash_directory, prune_files_by_pattern # --------------------------------------------------------------------------- # copy_src_subset – path-traversal protection @@ -100,3 +100,58 @@ def test_hash_directory_skiplist_excludes_file(tmp_path): h_with_skip2 = hash_directory(str(d), ["ignored.txt"]) assert h_with_skip == h_with_skip2 + + +# --------------------------------------------------------------------------- +# prune_files_by_pattern – delete-order safety +# --------------------------------------------------------------------------- + + +def test_prune_removes_matched_file(tmp_path): + (tmp_path / "remove_me.txt").write_text("gone") + prune_files_by_pattern(str(tmp_path), ["remove_me.txt"]) + assert not (tmp_path / "remove_me.txt").exists() + + +def test_prune_parent_and_child_both_matched_no_error(tmp_path): + """When a dir and a file inside it both match, removal must not raise. + + Before the fix, removing the parent first left the child path pointing at a + non-existent location; the subsequent safe_rm call then raised + FileNotFoundError. + """ + src = tmp_path / "src" + src.mkdir() + (src / "main.c").write_text("int main(){}") + + # "src" matches the directory; "main.c" matches the child inside it. + prune_files_by_pattern(str(tmp_path), ["src", "main.c"]) + + assert not src.exists() + + +def test_prune_preserves_license_file(tmp_path): + """License files must survive even when they match a removal pattern.""" + (tmp_path / "LICENSE").write_text("MIT") + (tmp_path / "delete_me.txt").write_text("gone") + + prune_files_by_pattern(str(tmp_path), ["LICENSE", "delete_me.txt"]) + + assert (tmp_path / "LICENSE").exists() + assert not (tmp_path / "delete_me.txt").exists() + + +def test_prune_skips_already_removed_paths(tmp_path): + """Paths that no longer exist after a parent removal are silently skipped.""" + parent = tmp_path / "libs" + parent.mkdir() + child = parent / "lib.a" + child.write_text("binary") + unrelated = tmp_path / "readme.txt" + unrelated.write_text("keep") + + # Both "libs" (directory) and "libs/lib.a" (child) match; no exception expected. 
+ prune_files_by_pattern(str(tmp_path), ["libs", "lib.a"]) + + assert not parent.exists() + assert unrelated.exists() From a2a464ce11b6e88b3603d2cc411dbd3f119cbac7 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 22 Mar 2026 12:57:39 +0000 Subject: [PATCH 31/35] Fix CodeRabbitAI review comments: security, robustness, and platform fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security improvements: - archive.py: Add `_check_tar_member_type()` to validate TAR members on ALL Python versions (not just 3.11.4+). Rejects symlinks with absolute/escaping targets, hardlinks with path-traversal targets, device files (chr/blk), and FIFOs. The `filter="tar"` on Python ≥ 3.11.4 now acts as defence-in-depth rather than the sole guard. Bug fixes: - log.py: Escape user-supplied strings (project names, info messages) with `rich.markup.escape()` before embedding them in Rich markup templates. Prevents `rich.errors.MarkupError` when project or remote names contain Rich markup syntax (e.g. `[/`). Fixes pre-existing crash in fuzzing tests. Platform / correctness fixes: - test_archive.py: Use `pathlib.Path.as_uri()` for RFC-compliant file URIs instead of manual string manipulation that is not portable on Windows. - generic_steps.py: Use `pathlib.Path.as_posix()` in `remote_server_path()` instead of `os.sep`-based split/join, which could fail with mixed path separators on Windows. Tests: - Add 12 new unit tests for `_check_tar_member_type()` covering safe symlinks, absolute symlinks, `..` symlinks, safe/escaping hardlinks, char/block device files, FIFOs, and integration with `_check_tar_members`. https://claude.ai/code/session_0169NNA4kGKpDkZVdLQE7VXn --- dfetch/log.py | 28 ++++--- dfetch/vcs/archive.py | 77 +++++++++++++++--- features/steps/generic_steps.py | 4 +- tests/test_archive.py | 135 +++++++++++++++++++++++++++++++- 4 files changed, 217 insertions(+), 27 deletions(-) diff --git a/dfetch/log.py b/dfetch/log.py index 52476ffa..fbdc7a32 100644 --- a/dfetch/log.py +++ b/dfetch/log.py @@ -9,6 +9,7 @@ from rich.console import Console from rich.highlighter import NullHighlighter from rich.logging import RichHandler +from rich.markup import escape as markup_escape from rich.status import Status from dfetch import __version__ @@ -52,26 +53,30 @@ class DLogger(logging.Logger): def print_report_line(self, name: str, info: str) -> None: """Print a line for a report.""" + safe_name = markup_escape(name) + safe_info = markup_escape(info) self.info( - f" [bold][bright_green]{name:20s}:[/bright_green][blue] {info}[/blue][/bold]" + f" [bold][bright_green]{safe_name:20s}:[/bright_green][blue] {safe_info}[/blue][/bold]" ) def print_info_line(self, name: str, info: str) -> None: """Print a line of info, only printing the project name once.""" if name not in DLogger._printed_projects: - self.info(f" [bold][bright_green]{name}:[/bright_green][/bold]") + safe_name = markup_escape(name) + self.info(f" [bold][bright_green]{safe_name}:[/bright_green][/bold]") DLogger._printed_projects.add(name) - line = info.replace("\n", "\n ") + line = markup_escape(info).replace("\n", "\n ") self.info(f" [bold blue]> {line}[/bold blue]") def print_warning_line(self, name: str, info: str) -> None: """Print a warning line: green name, yellow value.""" if name not in DLogger._printed_projects: - self.info(f" [bold][bright_green]{name}:[/bright_green][/bold]") + safe_name = markup_escape(name) + self.info(f" [bold][bright_green]{safe_name}:[/bright_green][/bold]") 
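+ # (without the escape above, a name containing Rich markup such as "[/" + # would raise rich.errors.MarkupError)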
DLogger._printed_projects.add(name) - line = info.replace("\n", "\n ") + line = markup_escape(info).replace("\n", "\n ") self.info(f" [bold bright_yellow]> {line}[/bold bright_yellow]") def print_title(self) -> None: @@ -85,12 +90,14 @@ def print_info_field(self, field_name: str, field: str) -> None: def warning(self, msg: object, *args: Any, **kwargs: Any) -> None: """Log warning.""" super().warning( - f"[bold bright_yellow]{msg}[/bold bright_yellow]", *args, **kwargs + f"[bold bright_yellow]{markup_escape(str(msg))}[/bold bright_yellow]", + *args, + **kwargs, ) def error(self, msg: object, *args: Any, **kwargs: Any) -> None: """Log error.""" - super().error(f"[red]{msg}[/red]", *args, **kwargs) + super().error(f"[red]{markup_escape(str(msg))}[/red]", *args, **kwargs) def status( self, name: str, message: str, spinner: str = "dots", enabled: bool = True @@ -111,11 +118,12 @@ def status( return nullcontext(None) if name not in DLogger._printed_projects: - self.info(f" [bold][bright_green]{name}:[/bright_green][/bold]") + safe_name = markup_escape(name) + self.info(f" [bold][bright_green]{safe_name}:[/bright_green][/bold]") DLogger._printed_projects.add(name) return Status( - f"[bold bright_blue]> {message}[/bold bright_blue]", + f"[bold bright_blue]> {markup_escape(message)}[/bold bright_blue]", spinner=spinner, console=rich_console, ) @@ -138,7 +146,7 @@ def filter(self, record: logging.LogRecord) -> bool: """Add indentation to the log record message.""" color = "blue" if record.levelno < logging.WARNING else "yellow" - line = record.msg.replace("\n", "\n ") + line = markup_escape(str(record.msg)).replace("\n", "\n ") record.msg = f"{self.prefix}[{color}]{line}[/{color}]" return True diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index b1c2f665..52265947 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -312,18 +312,68 @@ def check_zip_members(zf: zipfile.ZipFile) -> list[zipfile.ZipInfo]: ArchiveLocalRepo._check_archive_member_path(info.filename) return members + @staticmethod + def _check_tar_member_type(member: tarfile.TarInfo) -> None: + """Reject dangerous TAR member types that could harm the host system. + + On Python ≥ 3.11.4 the ``filter="tar"`` passed to + :meth:`tarfile.TarFile.extractall` already blocks many of these, but + we validate here too so the guard is active on **all** supported Python + versions and provides defence-in-depth on newer ones. + + Rejected member types: + + * **Symlinks with absolute or escaping targets** — could create a + foothold outside the extraction directory for later writes. + * **Hard links with absolute or escaping targets** — same risk as + dangerous symlinks; the target path is validated like a regular + member name. + * **Device files** (character, block) — accessing ``/dev/mem`` or + similar via an extracted device node can compromise the host. + * **FIFO / named pipes** — rarely present in software archives and + can be used to communicate with host processes or block extraction. + + Raises: + RuntimeError: When *member* is a disallowed or unsafe member type. + """ + if member.issym(): + target = member.linkname + if os.path.isabs(target) or any( + part == ".." for part in pathlib.PurePosixPath(target).parts + ): + raise RuntimeError( + f"Archive contains a symlink with an unsafe target: " + f"{member.name!r} -> {target!r}" + ) + elif member.islnk(): + # Hard-link targets are archive-relative paths; apply the same + # path-traversal check as we do for regular member names. 
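+ # (e.g. a member hard-linked to "../../outside-the-tree" is rejected here)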
+ ArchiveLocalRepo._check_archive_member_path(member.linkname) + elif member.isdev() or member.isfifo(): + raise RuntimeError( + f"Archive contains a special file (device/FIFO): {member.name!r}" + ) + @staticmethod def _check_tar_members(tf: tarfile.TarFile) -> None: - """Validate TAR members against decompression bombs and path traversal. + """Validate TAR members against decompression bombs and unsafe member types. + + Checks applied (all supported Python versions): + + * **Size / count limits** — guard against decompression-bomb archives. + * **Path traversal** — reject absolute paths and ``..`` components. + * **Unsafe member types** — reject symlinks with absolute or escaping + targets, hardlinks with escaping targets, device files, and FIFOs + (see :meth:`_check_tar_member_type`). - Size/count limits mirror :meth:`check_zip_members`. Path validation - is defence-in-depth: on Python ≥ 3.11.4 the ``filter="tar"`` passed to - :meth:`tarfile.TarFile.extractall` also rejects unsafe paths, but we - check here too so the guard applies on all supported Python versions. + On Python ≥ 3.11.4 the ``filter="tar"`` passed to + :meth:`tarfile.TarFile.extractall` provides additional OS-level + protection; these checks remain as defence-in-depth. Raises: - RuntimeError: When the archive exceeds the size/count limits or - contains an absolute path or ``..`` component. + RuntimeError: When the archive exceeds the size/count limits, + contains an absolute path or ``..`` component, or contains an + unsafe member type (dangerous symlink, device file, FIFO). """ members = tf.getmembers() ArchiveLocalRepo._check_archive_limits( @@ -331,6 +381,7 @@ def _check_tar_members(tf: tarfile.TarFile) -> None: ) for member in members: ArchiveLocalRepo._check_archive_member_path(member.name) + ArchiveLocalRepo._check_tar_member_type(member) @staticmethod def _extract_raw(archive_path: str, dest_dir: str) -> None: @@ -338,12 +389,12 @@ def _extract_raw(archive_path: str, dest_dir: str) -> None: Safety checks performed before extraction: - * TAR: member count and total uncompressed size (decompression bomb). - Path sanitisation uses the built-in ``filter="tar"`` filter when - available (Python ≥ 3.11.4 / 3.12), which rejects absolute paths, - ``..`` components, absolute symlinks, and device files. On older - Python releases extraction proceeds without the filter (member-path - attacks are still blocked by ``_check_tar_members``). + * TAR: :meth:`_check_tar_members` validates every member for + decompression-bomb limits, path traversal, dangerous symlink + targets, hardlink targets, device files, and FIFOs on **all** + supported Python versions. When Python ≥ 3.11.4 is available the + built-in ``filter="tar"`` provides additional OS-level enforcement + as defence-in-depth. * ZIP: member path traversal validation (absolute paths and ``..`` components are rejected) plus member count and size limits. 
""" diff --git a/features/steps/generic_steps.py b/features/steps/generic_steps.py index 138d043a..2e0e7d35 100644 --- a/features/steps/generic_steps.py +++ b/features/steps/generic_steps.py @@ -50,8 +50,8 @@ def temporary_env(key: str, value: str): def remote_server_path(context): - """Get the path to the remote dir.""" - return "/".join(context.remotes_dir_path.split(os.sep)) + """Get the path to the remote dir as a POSIX path string.""" + return pathlib.Path(context.remotes_dir_path).as_posix() def call_command(context: Context, args: list[str], path: Optional[str] = ".") -> None: diff --git a/tests/test_archive.py b/tests/test_archive.py index 6e9a8c45..47890001 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -3,6 +3,7 @@ import hashlib import io import os +import pathlib import tarfile import tempfile import zipfile @@ -24,6 +25,7 @@ _check_archive_limits = ArchiveLocalRepo._check_archive_limits _check_zip_members = ArchiveLocalRepo.check_zip_members _check_tar_members = ArchiveLocalRepo._check_tar_members +_check_tar_member_type = ArchiveLocalRepo._check_tar_member_type # --------------------------------------------------------------------------- @@ -168,6 +170,135 @@ def test_check_tar_members_absolute(): _check_tar_members(tf) +def _make_tar_with_member(setup_fn) -> tarfile.TarFile: + """Create an in-memory tar whose members are set up by *setup_fn(tf)*.""" + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:") as tf: + setup_fn(tf) + buf.seek(0) + return tarfile.open(fileobj=buf, mode="r:") + + +def _add_symlink(tf: tarfile.TarFile, name: str, target: str) -> None: + info = tarfile.TarInfo(name=name) + info.type = tarfile.SYMTYPE + info.linkname = target + tf.addfile(info) + + +def _add_hardlink(tf: tarfile.TarFile, name: str, target: str) -> None: + info = tarfile.TarInfo(name=name) + info.type = tarfile.LNKTYPE + info.linkname = target + tf.addfile(info) + + +def _add_chrdev(tf: tarfile.TarFile, name: str) -> None: + info = tarfile.TarInfo(name=name) + info.type = tarfile.CHRTYPE + tf.addfile(info) + + +def _add_blkdev(tf: tarfile.TarFile, name: str) -> None: + info = tarfile.TarInfo(name=name) + info.type = tarfile.BLKTYPE + tf.addfile(info) + + +def _add_fifo(tf: tarfile.TarFile, name: str) -> None: + info = tarfile.TarInfo(name=name) + info.type = tarfile.FIFOTYPE + tf.addfile(info) + + +# --------------------------------------------------------------------------- +# _check_tar_member_type — symlink validation +# --------------------------------------------------------------------------- + + +def test_check_tar_member_type_safe_symlink(): + tf = _make_tar_with_member(lambda t: _add_symlink(t, "link", "relative/target")) + member = tf.getmembers()[0] + _check_tar_member_type(member) # should not raise + + +def test_check_tar_member_type_absolute_symlink(): + tf = _make_tar_with_member(lambda t: _add_symlink(t, "link", "/etc/passwd")) + member = tf.getmembers()[0] + with pytest.raises(RuntimeError, match="unsafe target"): + _check_tar_member_type(member) + + +def test_check_tar_member_type_dotdot_symlink(): + tf = _make_tar_with_member(lambda t: _add_symlink(t, "link", "../../etc/passwd")) + member = tf.getmembers()[0] + with pytest.raises(RuntimeError, match="unsafe target"): + _check_tar_member_type(member) + + +# --------------------------------------------------------------------------- +# _check_tar_member_type — hardlink validation +# --------------------------------------------------------------------------- + + +def 
test_check_tar_member_type_safe_hardlink(): + tf = _make_tar_with_member(lambda t: _add_hardlink(t, "hardlink", "project/real.c")) + member = tf.getmembers()[0] + _check_tar_member_type(member) # should not raise + + +def test_check_tar_member_type_dotdot_hardlink(): + tf = _make_tar_with_member( + lambda t: _add_hardlink(t, "hardlink", "../outside/secret.txt") + ) + member = tf.getmembers()[0] + with pytest.raises(RuntimeError, match="unsafe member path"): + _check_tar_member_type(member) + + +# --------------------------------------------------------------------------- +# _check_tar_member_type — device / FIFO validation +# --------------------------------------------------------------------------- + + +def test_check_tar_member_type_char_device(): + tf = _make_tar_with_member(lambda t: _add_chrdev(t, "dev/mem")) + member = tf.getmembers()[0] + with pytest.raises(RuntimeError, match="special file"): + _check_tar_member_type(member) + + +def test_check_tar_member_type_block_device(): + tf = _make_tar_with_member(lambda t: _add_blkdev(t, "dev/sda")) + member = tf.getmembers()[0] + with pytest.raises(RuntimeError, match="special file"): + _check_tar_member_type(member) + + +def test_check_tar_member_type_fifo(): + tf = _make_tar_with_member(lambda t: _add_fifo(t, "named_pipe")) + member = tf.getmembers()[0] + with pytest.raises(RuntimeError, match="special file"): + _check_tar_member_type(member) + + +# --------------------------------------------------------------------------- +# _check_tar_members — integration of member-type validation +# --------------------------------------------------------------------------- + + +def test_check_tar_members_rejects_absolute_symlink(): + tf = _make_tar_with_member(lambda t: _add_symlink(t, "link", "/etc/passwd")) + with pytest.raises(RuntimeError, match="unsafe target"): + _check_tar_members(tf) + + +def test_check_tar_members_rejects_device_file(): + tf = _make_tar_with_member(lambda t: _add_chrdev(t, "dev/mem")) + with pytest.raises(RuntimeError, match="special file"): + _check_tar_members(tf) + + # --------------------------------------------------------------------------- # ArchiveRemote.is_accessible # --------------------------------------------------------------------------- @@ -177,7 +308,7 @@ def test_is_accessible_existing_file(): with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as f: path = f.name try: - url = f"file:///{path.lstrip('/')}" + url = pathlib.Path(path).as_uri() remote = ArchiveRemote(url) assert remote.is_accessible() is True finally: @@ -277,7 +408,7 @@ def _sha256_file(path: str) -> str: def _file_url(path: str) -> str: - return "file:///" + path.lstrip("/") + return pathlib.Path(path).as_uri() def _make_subproject(url: str) -> ArchiveSubProject: From 501e3d98105a56dc73158a687ce89c9cada049ca Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 22 Mar 2026 19:20:07 +0000 Subject: [PATCH 32/35] Review comments --- dfetch/log.py | 16 ++++++++ dfetch/reporting/sbom_reporter.py | 23 ++++++++--- dfetch/util/purl.py | 64 +++++-------------------------- dfetch/util/util.py | 10 ++--- dfetch/vcs/archive.py | 50 ++++++++++++++++++++---- doc/landing-page/conf.py | 10 ++++- features/steps/generic_steps.py | 26 ++++++++++--- features/steps/manifest_steps.py | 18 ++++----- tests/test_purl.py | 9 +++-- 9 files changed, 133 insertions(+), 93 deletions(-) diff --git a/dfetch/log.py b/dfetch/log.py index fbdc7a32..0eb9b402 100644 --- a/dfetch/log.py +++ b/dfetch/log.py @@ -3,6 +3,7 @@ import logging import os import sys +import types 
from contextlib import nullcontext from typing import Any, cast @@ -194,7 +195,22 @@ def get_logger(name: str, console: Console | None = None) -> DLogger: def configure_external_logger(name: str, level: int = logging.INFO) -> None: """Configure an external logger from a third party package.""" logger = logging.getLogger(name) + # Ensure the external logger is a plain Logger so its log methods do not + # wrap messages in Rich markup (which DLogger.warning / DLogger.error do). + # Without this, markup_escape in ExtLogFilter would turn those Rich tags + # into literal text that shifts tab-stop calculations when rendered. + logger.__class__ = logging.Logger logger.setLevel(level) logger.propagate = True logger.handlers.clear() logger.addFilter(ExtLogFilter()) + # Some packages (e.g. patch_ng) cache logger bound-methods as module-level + # names at import time (e.g. `warning = logger.warning`). After the + # __class__ reassignment above those cached references still point at the + # old DLogger method, so re-bind them to the freshly demoted logger. + module = sys.modules.get(name.split(".")[0]) + if module is not None: + for method_name in ("debug", "info", "warning", "error", "critical"): + attr = getattr(module, method_name, None) + if isinstance(attr, types.MethodType) and attr.__self__ is logger: + setattr(module, method_name, getattr(logger, method_name)) diff --git a/dfetch/reporting/sbom_reporter.py b/dfetch/reporting/sbom_reporter.py index 9df6dc53..0b94461a 100644 --- a/dfetch/reporting/sbom_reporter.py +++ b/dfetch/reporting/sbom_reporter.py @@ -109,12 +109,13 @@ from cyclonedx.schema import OutputFormat, SchemaVersion from packageurl import PackageURL -import dfetch.util.purl +import dfetch from dfetch.manifest.manifest import Manifest from dfetch.manifest.project import ProjectEntry from dfetch.reporting.reporter import Reporter from dfetch.util.license import License -from dfetch.util.purl import DFETCH_TO_CDX_HASH_ALGORITHM +from dfetch.util.purl import vcs_url_to_purl +from dfetch.vcs.archive import archive_url_to_purl from dfetch.vcs.integrity_hash import IntegrityHash # PyRight is pedantic with decorators see https://github.com/madpah/serializable/issues/8 @@ -123,6 +124,14 @@ # pyright: reportCallIssue=false, reportAttributeAccessIssue=false +# Map from dfetch hash-field algorithm prefix to CycloneDX HashAlgorithm name +DFETCH_TO_CDX_HASH_ALGORITHM: dict[str, str] = { + "sha256": "SHA-256", + "sha384": "SHA-384", + "sha512": "SHA-512", +} + + class SbomReporter(Reporter): """Reporter for generating SBoM's.""" @@ -189,9 +198,13 @@ def add_project( version: str, ) -> None: """Add a project to the report.""" - purl = dfetch.util.purl.remote_url_to_purl( - project.remote_url, version=version, subpath=project.source or None - ) + subpath = project.source or None + if project.vcs == "archive": + purl = archive_url_to_purl( + project.remote_url, version=version, subpath=subpath + ) + else: + purl = vcs_url_to_purl(project.remote_url, version=version, subpath=subpath) name = project.name if purl.type == "generic" else purl.name location = self.manifest.find_name_in_manifest(project.name) component = Component( diff --git a/dfetch/util/purl.py b/dfetch/util/purl.py index d68ab5e1..294d8a5a 100644 --- a/dfetch/util/purl.py +++ b/dfetch/util/purl.py @@ -1,17 +1,14 @@ -"""Module to convert remote URLs to valid Package URLs (PURLs). +"""Module to convert VCS remote URLs to valid Package URLs (PURLs). -Supports: GitHub, Bitbucket, SVN, SSH paths, archives, and more. 
+Supports: GitHub, Bitbucket, SVN, SSH paths, and generic VCS URLs. """ -import os.path import re from urllib.parse import urlparse from packageurl import PackageURL from tldextract import TLDExtract -from dfetch.vcs.archive import ARCHIVE_EXTENSIONS - # Although tldextract can fetch the latest suffix list, we don't want that here NO_FETCH_EXTRACT = TLDExtract(suffix_list_urls=(), extra_suffixes=("local",)) @@ -38,33 +35,10 @@ # These domains have no specific Purl type, but adding the domain to the purl doesn't add any value EXCLUDED_DOMAINS = ["gitlab", "gitea", "gitee", "sf", "gnu"] -# Map from dfetch hash-field algorithm prefix to CycloneDX HashAlgorithm name -DFETCH_TO_CDX_HASH_ALGORITHM: dict[str, str] = { - "sha256": "SHA-256", - "sha384": "SHA-384", - "sha512": "SHA-512", -} - # Name given to a package or group if it is not extractable from the URL DEFAULT_NAME = "unknown" -def _is_archive_url(url: str) -> bool: - """Return *True* when *url* points to a recognised archive file.""" - lower = url.lower().split("?")[0] # strip query string before checking extension - return any(lower.endswith(ext) for ext in ARCHIVE_EXTENSIONS) - - -def _strip_archive_extension(name: str) -> str: - """Remove a recognised archive extension from *name*.""" - lower = name.lower() - # Check multi-part extensions first (.tar.gz etc.) - for ext in ARCHIVE_EXTENSIONS: - if lower.endswith(ext): - return name[: -len(ext)] - return name - - def _namespace_and_name_from_domain_and_path(domain: str, path: str) -> tuple[str, str]: """Split the full path to a name and namespace.""" domain = NO_FETCH_EXTRACT(domain).domain @@ -103,24 +77,6 @@ def _known_purl_types( return None -def _archive_purl( - remote_url: str, version: str | None, subpath: str | None -) -> PackageURL: - """Build a generic PURL for an archive URL.""" - parsed = urlparse(remote_url) - basename = os.path.basename(parsed.path) - name = _strip_archive_extension(basename) or DEFAULT_NAME - namespace = parsed.hostname or "" - return PackageURL( - type="generic", - namespace=namespace or None, - name=name, - version=version, - qualifiers={"download_url": remote_url}, - subpath=subpath, - ) - - def _vcs_namespace_and_name(remote_url: str) -> tuple[str, str, str]: """Derive namespace, name, and normalised URL for a generic VCS remote URL. @@ -150,25 +106,23 @@ def _vcs_namespace_and_name(remote_url: str) -> tuple[str, str, str]: return namespace, name, remote_url -def remote_url_to_purl( - remote_url: str, version: str | None = None, subpath: str | None = None +def vcs_url_to_purl( + vcs_url: str, version: str | None = None, subpath: str | None = None ) -> PackageURL: - """Convert a remote URL to a valid PackageURL object. + """Convert a VCS remote URL to a valid PackageURL object. - Supports GitHub, Bitbucket, SVN, SSH paths, and archive downloads. + Supports GitHub, Bitbucket, SVN, SSH paths, and generic VCS URLs. Optionally specify version and subpath. 
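+
+    Archive download URLs are handled separately by
+    :func:`dfetch.vcs.archive.archive_url_to_purl`.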
""" - purl = _known_purl_types(remote_url, version, subpath) + purl = _known_purl_types(vcs_url, version, subpath) if purl: return purl - if _is_archive_url(remote_url): - return _archive_purl(remote_url, version, subpath) - namespace, name, remote_url = _vcs_namespace_and_name(remote_url) + namespace, name, vcs_url = _vcs_namespace_and_name(vcs_url) return PackageURL( type="generic", namespace=namespace, name=name, version=version, - qualifiers={"vcs_url": remote_url}, + qualifiers={"vcs_url": vcs_url}, subpath=subpath, ) diff --git a/dfetch/util/util.py b/dfetch/util/util.py index bca0935c..24fdeceb 100644 --- a/dfetch/util/util.py +++ b/dfetch/util/util.py @@ -89,17 +89,17 @@ def prune_files_by_pattern(directory: str, patterns: Sequence[str]) -> None: seen: set[str] = set() paths = [] for file_or_dir in find_matching_files(directory, patterns): - resolved = str(file_or_dir.resolve()) - if resolved in seen: + path_str = str(file_or_dir) + if path_str in seen: continue - seen.add(resolved) + seen.add(path_str) paths.append(file_or_dir) # Remove children before parents to avoid FileNotFoundError on already-deleted paths. - paths.sort(key=lambda p: len(str(p.resolve())), reverse=True) + paths.sort(key=lambda p: len(str(p)), reverse=True) for file_or_dir in paths: - if file_or_dir.exists() and not ( + if os.path.lexists(str(file_or_dir)) and not ( file_or_dir.is_file() and is_license_file(file_or_dir.name) ): safe_rm(file_or_dir) diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index 52265947..8d1002fa 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -36,6 +36,8 @@ from collections.abc import Sequence from typing import overload +from packageurl import PackageURL + from dfetch.log import get_logger from dfetch.util.util import ( copy_directory_contents, @@ -43,16 +45,55 @@ prune_files_by_pattern, ) +logger = get_logger(__name__) + #: Archive file extensions recognised by DFetch. ARCHIVE_EXTENSIONS = (".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".zip") -logger = get_logger(__name__) - # Safety limits applied during extraction to prevent decompression bombs. _MAX_UNCOMPRESSED_BYTES = 500 * 1024 * 1024 # 500 MB _MAX_MEMBER_COUNT = 10_000 +def is_archive_url(url: str) -> bool: + """Return *True* when *url* ends with a recognised archive extension. + + Query strings and fragments are stripped before testing so that URLs like + ``https://example.com/pkg.tar.gz?download=1`` are correctly recognised. 
+    """
+    path = urllib.parse.urlparse(url).path
+    return any(path.lower().endswith(ext) for ext in ARCHIVE_EXTENSIONS)
+
+
+def strip_archive_extension(name: str) -> str:
+    """Remove a recognised archive extension from *name*."""
+    lower = name.lower()
+    for ext in ARCHIVE_EXTENSIONS:
+        if lower.endswith(ext):
+            return name[: -len(ext)]
+    return name
+
+
+def archive_url_to_purl(
+    download_url: str,
+    version: str | None = None,
+    subpath: str | None = None,
+) -> PackageURL:
+    """Build a generic PackageURL for an archive download URL."""
+    parsed = urllib.parse.urlparse(download_url)
+    basename = os.path.basename(parsed.path)
+    name = strip_archive_extension(basename) or "unknown"
+    namespace = parsed.hostname or ""
+    return PackageURL(
+        type="generic",
+        namespace=namespace or None,
+        name=name,
+        version=version,
+        qualifiers={"download_url": download_url},
+        subpath=subpath,
+    )
+
+
 def _http_conn(scheme: str, netloc: str, timeout: int) -> http.client.HTTPConnection:
     """Return an :class:`http.client.HTTPConnection` or HTTPS variant for *netloc*."""
     if scheme == "https":
@@ -66,11 +107,6 @@ def _resource_path(parsed: urllib.parse.ParseResult) -> str:
     return f"{path}?{parsed.query}" if parsed.query else path
 
 
-def is_archive_url(url: str) -> bool:
-    """Return *True* when *url* ends with a recognised archive extension."""
-    return any(url.lower().endswith(ext) for ext in ARCHIVE_EXTENSIONS)
-
-
 class ArchiveRemote:
     """Represents a remote archive (tar/zip) URL.
 
diff --git a/doc/landing-page/conf.py b/doc/landing-page/conf.py
index ee3a7cb0..e5762639 100644
--- a/doc/landing-page/conf.py
+++ b/doc/landing-page/conf.py
@@ -95,11 +95,17 @@
 html_css_files = [
     (
         "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/brands.min.css",
-        {"integrity": "sha512-8RxmFOVaKQe/xtg6lbscU9DU0IRhURWEuiI0tXevv+lXbAHfkpamD4VKFQRto9WgfOJDwOZ74c/s9Yesv3VvIQ==", "crossorigin": "anonymous"},
+        {
+            "integrity": "sha512-8RxmFOVaKQe/xtg6lbscU9DU0IRhURWEuiI0tXevv+lXbAHfkpamD4VKFQRto9WgfOJDwOZ74c/s9Yesv3VvIQ==",
+            "crossorigin": "anonymous",
+        },
     ),
     (
         "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/fontawesome.min.css",
-        {"integrity": "sha512-d0olNN35C6VLiulAobxYHZiXJmq+vl+BGIgAxQtD5+kqudro/xNMvv2yIHAciGHpExsIbKX3iLg+0B6d0k4+ZA==", "crossorigin": "anonymous"},
+        {
+            "integrity": "sha512-d0olNN35C6VLiulAobxYHZiXJmq+vl+BGIgAxQtD5+kqudro/xNMvv2yIHAciGHpExsIbKX3iLg+0B6d0k4+ZA==",
+            "crossorigin": "anonymous",
+        },
     ),
     "css/custom.css",
 ]
diff --git a/features/steps/generic_steps.py b/features/steps/generic_steps.py
index 2e0e7d35..0f646dd7 100644
--- a/features/steps/generic_steps.py
+++ b/features/steps/generic_steps.py
@@ -95,8 +95,8 @@ def check_json(path: Union[str, os.PathLike], content: str) -> None:
     )
 
 
-def _apply_context_substitutions(text: str, context) -> str:
-    """Replace dynamic placeholders with values stored on *context*."""
+def apply_archive_substitutions(text: str, context) -> str:
+    """Replace archive-related dynamic placeholders with values stored on *context*."""
     if hasattr(context, "archive_sha256"):
         text = text.replace("<archive_sha256>", context.archive_sha256)
     if hasattr(context, "archive_sha384"):
@@ -109,7 +109,23 @@ def _json_subset_matches(expected, actual) -> bool:
-    """Return *True* when *expected* is a subset of *actual* (recursive)."""
+    """Return *True* when *expected* is a subset of *actual* (recursive).
+
+    **List matching is greedy and order-sensitive.** Each item in *expected*
+    is matched against *actual* in order, claiming the first unused actual
+    item that satisfies the subset check. This means an earlier expected
+    item can consume the only actual item that a later, more specific
+    expected item would need. For example, with::
+
+        expected = [{"a": 1}, {"a": 1, "b": 2}]
+        actual = [{"a": 1, "b": 2}]
+
+    the first expected item matches ``{"a": 1, "b": 2}`` (leaving nothing
+    for the second), so the overall match returns *False* even though
+    ``{"a": 1, "b": 2}`` satisfies the second item. Because matching is
+    greedy, the outcome depends on the order of *expected*; order such
+    lists from most-specific to least-specific to avoid this pitfall.
+    """
     if isinstance(expected, dict):
         if not isinstance(actual, dict):
             return False
@@ -140,7 +156,7 @@ def check_json_subset(path: Union[str, os.PathLike], content: str, context) -> None:
     Dynamic placeholders (``<archive_sha256>``, ``<archive_url>``) in *content* are
     substituted with values from *context* before parsing.
     """
-    content = _apply_context_substitutions(content, context)
+    content = apply_archive_substitutions(content, context)
 
     with open(path, "r", encoding="UTF-8") as file_to_check:
         actual_json = json.load(file_to_check)
@@ -254,7 +270,7 @@ def check_output(context, line_count=None):
         context: Behave context with cmd_output and expected text
         line_count: If set, compare only the first N lines of actual output
     """
-    expected_raw = _apply_context_substitutions(context.text, context)
+    expected_raw = apply_archive_substitutions(context.text, context)
 
     expected_text = multisub(
         patterns=[
diff --git a/features/steps/manifest_steps.py b/features/steps/manifest_steps.py
index e0861b4c..30d0c2f4 100644
--- a/features/steps/manifest_steps.py
+++ b/features/steps/manifest_steps.py
@@ -9,22 +9,20 @@
 
 from behave import given, then, when  # pylint: disable=no-name-in-module
 
-from features.steps.generic_steps import check_file, generate_file, remote_server_path
+from features.steps.generic_steps import (
+    apply_archive_substitutions,
+    check_file,
+    generate_file,
+    remote_server_path,
+)
 
 
 def apply_manifest_substitutions(context, contents: str) -> str:
     """Apply context-specific substitutions to manifest contents."""
-    result = contents.replace(
+    result = apply_archive_substitutions(contents, context)
+    result = result.replace(
         "url: some-remote-server", f"url: file:///{remote_server_path(context)}"
     )
-    if hasattr(context, "archive_sha256"):
-        result = result.replace("<archive_sha256>", context.archive_sha256)
-    if hasattr(context, "archive_sha384"):
-        result = result.replace("<archive_sha384>", context.archive_sha384)
-    if hasattr(context, "archive_sha512"):
-        result = result.replace("<archive_sha512>", context.archive_sha512)
-    if hasattr(context, "archive_url"):
-        result = result.replace("<archive_url>", context.archive_url)
     return result
 
diff --git a/tests/test_purl.py b/tests/test_purl.py
index 5c417329..c78f2e44 100644
--- a/tests/test_purl.py
+++ b/tests/test_purl.py
@@ -2,7 +2,8 @@
 
 import pytest
 
-from dfetch.util.purl import remote_url_to_purl
+from dfetch.util.purl import vcs_url_to_purl
+from dfetch.vcs.archive import archive_url_to_purl
 
 
 @pytest.mark.parametrize(
@@ -117,7 +118,7 @@
     ],
 )
 def test_remote_url_to_purl(url, expected):
-    purl = remote_url_to_purl(url)
+    purl = vcs_url_to_purl(url)
     if expected is None:
         assert purl is None
     else:
@@ -173,7 +174,7 @@
 def test_archive_url_to_purl_attributes(
     url, expected_name, expected_namespace, expected_download_url
 ):
-    purl = 
remote_url_to_purl(url) + purl = archive_url_to_purl(url) assert purl.type == "generic" assert purl.name == expected_name assert (purl.namespace or "") == expected_namespace @@ -183,5 +184,5 @@ def test_archive_url_to_purl_attributes( def test_archive_purl_with_version(): url = "https://example.com/lib-1.0.tar.gz" - purl = remote_url_to_purl(url, version="sha256:" + "a" * 64) + purl = archive_url_to_purl(url, version="sha256:" + "a" * 64) assert purl.version == "sha256:" + "a" * 64 From b16855f383faa0ea38e71278f9cc629c40690210 Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 22 Mar 2026 19:31:49 +0000 Subject: [PATCH 33/35] Review comments --- dfetch/log.py | 3 ++- dfetch/reporting/sbom_reporter.py | 2 +- dfetch/vcs/archive.py | 8 +++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/dfetch/log.py b/dfetch/log.py index 0eb9b402..0025fe25 100644 --- a/dfetch/log.py +++ b/dfetch/log.py @@ -147,8 +147,9 @@ def filter(self, record: logging.LogRecord) -> bool: """Add indentation to the log record message.""" color = "blue" if record.levelno < logging.WARNING else "yellow" - line = markup_escape(str(record.msg)).replace("\n", "\n ") + line = markup_escape(record.getMessage()).replace("\n", "\n ") record.msg = f"{self.prefix}[{color}]{line}[/{color}]" + record.args = () return True diff --git a/dfetch/reporting/sbom_reporter.py b/dfetch/reporting/sbom_reporter.py index 0b94461a..ab5f0aef 100644 --- a/dfetch/reporting/sbom_reporter.py +++ b/dfetch/reporting/sbom_reporter.py @@ -319,7 +319,7 @@ def _apply_archive_refs( @staticmethod def _apply_vcs_refs(component: Component, purl: PackageURL) -> None: """Add VCS external reference and group for a generic VCS dependency.""" - component.group = purl.namespace + component.group = purl.namespace or None vcs_url = purl.qualifiers.get("vcs_url", "") # ExternalReferenceType.VCS does not support ssh:// urls if vcs_url and "ssh://" not in vcs_url: diff --git a/dfetch/vcs/archive.py b/dfetch/vcs/archive.py index 8d1002fa..a8caed9a 100644 --- a/dfetch/vcs/archive.py +++ b/dfetch/vcs/archive.py @@ -32,6 +32,7 @@ import tarfile import tempfile import urllib.parse +import urllib.request import zipfile from collections.abc import Sequence from typing import overload @@ -129,7 +130,7 @@ def is_accessible(self) -> bool: """ parsed = urllib.parse.urlparse(self.url) if parsed.scheme == "file": - return os.path.exists(parsed.path) + return os.path.exists(urllib.request.url2pathname(parsed.path)) if parsed.scheme not in ("http", "https"): return False return self._is_http_reachable(parsed) @@ -176,14 +177,15 @@ def download(self, dest_path: str, algorithm: str | None = None) -> str | None: hasher = hashlib.new(algorithm) if algorithm else None parsed = urllib.parse.urlparse(self.url) if parsed.scheme == "file": + file_path = urllib.request.url2pathname(parsed.path) try: if hasher: - with open(parsed.path, "rb") as src, open(dest_path, "wb") as dst: + with open(file_path, "rb") as src, open(dest_path, "wb") as dst: for chunk in iter(lambda: src.read(65536), b""): dst.write(chunk) hasher.update(chunk) else: - shutil.copy(parsed.path, dest_path) + shutil.copy(file_path, dest_path) except OSError as exc: raise RuntimeError( f"'{self.url}' is not a valid URL or unreachable: {exc}" From b6a19bfeab34d6066d79a0da55d6e9b00de4282a Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 22 Mar 2026 19:49:37 +0000 Subject: [PATCH 34/35] add missing dst path --- example/dfetch.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/example/dfetch.yaml b/example/dfetch.yaml index 
e2c6b3bd..4ee1e746 100644 --- a/example/dfetch.yaml +++ b/example/dfetch.yaml @@ -39,6 +39,7 @@ manifest: src: src - name: cpputest-git-rev-only + dst: Tests/cpputest-git-rev-only revision: d14505cc9191fcf17ccbd92af1c3409eb3969890 repo-path: cpputest/cpputest.git # Use external git directly From 59e59b2d9d39cec26054b0860097b2adab7cda1a Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 22 Mar 2026 19:51:59 +0000 Subject: [PATCH 35/35] Update demo magic hash --- doc/_ext/sphinxcontrib_asciinema/.dfetch_data.yaml | 4 ++-- doc/generate-casts/demo-magic/.dfetch_data.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/_ext/sphinxcontrib_asciinema/.dfetch_data.yaml b/doc/_ext/sphinxcontrib_asciinema/.dfetch_data.yaml index 49a36e79..4f9d5878 100644 --- a/doc/_ext/sphinxcontrib_asciinema/.dfetch_data.yaml +++ b/doc/_ext/sphinxcontrib_asciinema/.dfetch_data.yaml @@ -2,8 +2,8 @@ # For more info see https://dfetch.rtfd.io/en/latest/getting_started.html dfetch: branch: master - hash: 5b0a3a18e1e83d363f9eb0ac4b3fca17 - last_fetch: 26/01/2026, 23:40:59 + hash: dcd1473e1a3ca613b804e3e51e7ee342 + last_fetch: 22/03/2026, 19:52:31 patch: - doc/_ext/patches/001-autoformat-sphinxcontrib.asciinema.patch - doc/_ext/patches/002-fix-options-sphinxcontrib.asciinema.patch diff --git a/doc/generate-casts/demo-magic/.dfetch_data.yaml b/doc/generate-casts/demo-magic/.dfetch_data.yaml index 9c5c2598..07045916 100644 --- a/doc/generate-casts/demo-magic/.dfetch_data.yaml +++ b/doc/generate-casts/demo-magic/.dfetch_data.yaml @@ -2,8 +2,8 @@ # For more info see https://dfetch.rtfd.io/en/latest/getting_started.html dfetch: branch: master - hash: 476a29a874df3840ac2bd916e7097b92 - last_fetch: 14/10/2025, 19:16:12 + hash: d67278c164d7a103c46ff953560f1f0a + last_fetch: 22/03/2026, 19:50:56 patch: '' remote_url: https://github.com/paxtonhare/demo-magic.git revision: 2a2f439c26a93286dc2adc6ef2a81755af83f36e
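
A minimal, standalone sketch of the tar-hardening pattern these patches converge on (validate every member, then extract). The names check_member_type and safe_extract are illustrative, not dfetch's API; the real implementation is ArchiveLocalRepo._check_tar_member_type / _check_tar_members in dfetch/vcs/archive.py, and the symlink and hardlink checks are collapsed into one helper here for brevity:

    import os
    import pathlib
    import sys
    import tarfile


    def _escapes(target: str) -> bool:
        # A link target is unsafe when it is absolute or when any ".."
        # component lets it climb out of the extraction directory.
        return os.path.isabs(target) or ".." in pathlib.PurePosixPath(target).parts


    def check_member_type(member: tarfile.TarInfo) -> None:
        # Reject member types that could write outside the destination.
        if (member.issym() or member.islnk()) and _escapes(member.linkname):
            raise RuntimeError(f"unsafe link: {member.name!r} -> {member.linkname!r}")
        if member.isdev() or member.isfifo():
            raise RuntimeError(f"special file (device/FIFO): {member.name!r}")


    def safe_extract(archive_path: str, dest_dir: str) -> None:
        # Validate first, extract second: a hostile archive fails before
        # any file is written to dest_dir.
        with tarfile.open(archive_path) as tf:
            for member in tf.getmembers():
                check_member_type(member)
            if sys.version_info >= (3, 12):
                tf.extractall(dest_dir, filter="tar")  # defence-in-depth
            else:
                tf.extractall(dest_dir)

Checking all members up front, rather than filtering during extraction, means a rejected archive leaves no partial output behind; that is why the checks stay active even on Python versions where filter="tar" is available.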