From 24b46121b0dc75778b08676faa2ab734a1c2686d Mon Sep 17 00:00:00 2001 From: patchwright <292882882+patchwright@users.noreply.github.com> Date: Thu, 11 Jun 2026 21:20:07 +0200 Subject: [PATCH] Fix path/branch extraction stripping every marker occurrence GitHubPlatform and GitLabPlatform built `path`/`branch` with str.replace(marker, ""), which removes *every* occurrence of the marker rather than only the leading one. A URL whose file path or branch name contained the same segment again was silently corrupted: parse('.../blob/main/src/blob/utils.py').path -> 'main/srcutils.py' parse('.../tree/feature/tree/x').branch -> 'featurex' The leading marker is already guaranteed by the preceding startswith check, so slice it off instead. Slicing (rather than str.removeprefix) keeps the declared Python 3.8 compatibility. Adds regression tests for nested /blob/ paths on both GitHub and GitLab, and for a nested /tree/ branch name on GitHub. --- changes/150.bugfix | 1 + giturlparse/platforms/github.py | 4 +- giturlparse/platforms/gitlab.py | 6 +-- giturlparse/tests/test_parse.py | 74 +++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 changes/150.bugfix diff --git a/changes/150.bugfix b/changes/150.bugfix new file mode 100644 index 0000000..9e7c212 --- /dev/null +++ b/changes/150.bugfix @@ -0,0 +1 @@ +Fix ``path`` and ``branch`` extraction removing *every* ``/blob/`` and ``/tree/`` occurrence instead of only the leading marker, which corrupted file paths and branch names containing those segments. 
diff --git a/giturlparse/platforms/github.py b/giturlparse/platforms/github.py index e31943f..d795456 100644 --- a/giturlparse/platforms/github.py +++ b/giturlparse/platforms/github.py @@ -34,7 +34,7 @@ class GitHubPlatform(BasePlatform): def clean_data(data): data = BasePlatform.clean_data(data) if data["path_raw"].startswith("/blob/"): - data["path"] = data["path_raw"].replace("/blob/", "") + data["path"] = data["path_raw"][len("/blob/") :] if data["path_raw"].startswith("/tree/"): - data["branch"] = data["path_raw"].replace("/tree/", "") + data["branch"] = data["path_raw"][len("/tree/") :] return data diff --git a/giturlparse/platforms/gitlab.py b/giturlparse/platforms/gitlab.py index 44d3cd5..7c139c8 100644 --- a/giturlparse/platforms/gitlab.py +++ b/giturlparse/platforms/gitlab.py @@ -38,9 +38,9 @@ class GitLabPlatform(BasePlatform): def clean_data(data): data = BasePlatform.clean_data(data) if data["path_raw"].startswith("/blob/"): - data["path"] = data["path_raw"].replace("/blob/", "") + data["path"] = data["path_raw"][len("/blob/") :] if data["path_raw"].startswith("/-/blob/"): - data["path"] = data["path_raw"].replace("/-/blob/", "") + data["path"] = data["path_raw"][len("/-/blob/") :] if data["path_raw"].startswith("/-/tree/"): - data["branch"] = data["path_raw"].replace("/-/tree/", "") + data["branch"] = data["path_raw"][len("/-/tree/") :] return data diff --git a/giturlparse/tests/test_parse.py b/giturlparse/tests/test_parse.py index bd9e70d..0d18348 100644 --- a/giturlparse/tests/test_parse.py +++ b/giturlparse/tests/test_parse.py @@ -481,6 +481,80 @@ }, ), ), + ( + "HTTPS", + ( + # Regression: a file path that itself contains a "blob" directory must + # not have the inner "/blob/" stripped (only the leading marker). 
+ "https://github.com/nephila/giturlparse/blob/master/giturlparse/blob/data.py", + { + "host": "github.com", + "resource": "github.com", + "port": "", + "user": "git", + "owner": "nephila", + "repo": "giturlparse", + "name": "giturlparse", + "groups": [], + "path": "master/giturlparse/blob/data.py", + "path_raw": "/blob/master/giturlparse/blob/data.py", + "pathname": "/nephila/giturlparse/blob/master/giturlparse/blob/data.py", + "branch": "", + "protocol": "https", + "protocols": ["https"], + "platform": "github", + }, + ), + ), + ( + "HTTPS", + ( + # Regression: a branch name containing "/tree/" must be preserved + # in full rather than having every "/tree/" removed. + "https://github.com/nephila/giturlparse/tree/feature/tree/x", + { + "host": "github.com", + "resource": "github.com", + "port": "", + "user": "git", + "owner": "nephila", + "repo": "giturlparse", + "name": "giturlparse", + "groups": [], + "path": "", + "path_raw": "/tree/feature/tree/x", + "pathname": "/nephila/giturlparse/tree/feature/tree/x", + "branch": "feature/tree/x", + "protocol": "https", + "protocols": ["https"], + "platform": "github", + }, + ), + ), + ( + "HTTPS", + ( + # Regression (GitLab): inner "/blob/" in the file path must survive. + "https://gitlab.com/nephila/giturlparse/-/blob/master/giturlparse/blob/data.py", + { + "host": "gitlab.com", + "resource": "gitlab.com", + "port": "", + "user": "git", + "owner": "nephila", + "repo": "giturlparse", + "name": "giturlparse", + "groups": [], + "path": "master/giturlparse/blob/data.py", + "path_raw": "/-/blob/master/giturlparse/blob/data.py", + "pathname": "/nephila/giturlparse/-/blob/master/giturlparse/blob/data.py", + "branch": "", + "protocol": "https", + "protocols": ["https"], + "platform": "gitlab", + }, + ), + ), ( "HTTPS", (