Skip to content

Commit 68bad0c

Browse files
PascalThuetclaude
andcommitted
fix: address Copilot review on bounded reads and redirect-safety
- read_response_limited: read in a loop until EOF or one byte past the limit instead of a single read(max_bytes + 1). A server using chunked transfer encoding can return fewer bytes per read() than requested while streaming more than max_bytes total, defeating the single-read bound. Add regression tests for the short-read and within-limit paths. - _download_security: annotate _raise / _raise_from as NoReturn so type checkers treat call sites as unreachable. - Extract the duplicated is_https_or_localhost_http redirect-safety predicate into _download_security and import it from both _github_http and authentication/http so the rule lives in one place. - azure_devops: stop catching broad ValueError/KeyError around token acquisition; give the bounded read a dedicated _TokenResponseTooLarge type and catch only URLError, OSError, JSONDecodeError, and that type so unrelated programming errors still surface. - tests: make response mocks faithful streams (advancing cursor, b"" at EOF) so the bounded read loop terminates as it would against a real http.client.HTTPResponse. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 5a15b77 commit 68bad0c

11 files changed

Lines changed: 123 additions & 45 deletions

src/specify_cli/_download_security.py

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
import stat
88
import zipfile
99
from pathlib import Path, PurePosixPath
10-
from typing import TypeVar
10+
from typing import NoReturn, TypeVar
11+
from urllib.parse import urlparse
1112

1213

1314
ErrorT = TypeVar("ErrorT", bound=Exception)
@@ -20,11 +21,22 @@
2021
SHA256_RE = re.compile(r"^[0-9a-fA-F]{64}$")
2122

2223

23-
def _raise(error_type: type[ErrorT], message: str) -> None:
24+
def is_https_or_localhost_http(url: str) -> bool:
25+
"""Return True if *url* is HTTPS, or HTTP limited to loopback hosts.
26+
27+
Shared redirect-safety predicate used by the GitHub and auth HTTP redirect
28+
handlers so the rule (and any future tightening of it) lives in one place.
29+
"""
30+
parsed = urlparse(url)
31+
is_localhost = parsed.hostname in ("localhost", "127.0.0.1", "::1")
32+
return parsed.scheme == "https" or (parsed.scheme == "http" and is_localhost)
33+
34+
35+
def _raise(error_type: type[ErrorT], message: str) -> NoReturn:
2436
raise error_type(message)
2537

2638

27-
def _raise_from(error_type: type[ErrorT], message: str, exc: Exception) -> None:
39+
def _raise_from(error_type: type[ErrorT], message: str, exc: Exception) -> NoReturn:
2840
raise error_type(message) from exc
2941

3042

@@ -35,11 +47,25 @@ def read_response_limited(
3547
error_type: type[ErrorT] = ValueError,
3648
label: str = "download",
3749
) -> bytes:
38-
"""Read at most *max_bytes* from a response object."""
39-
data = response.read(max_bytes + 1)
40-
if len(data) > max_bytes:
50+
"""Read at most *max_bytes* from a response object.
51+
52+
``response.read(n)`` is only guaranteed to return *up to* ``n`` bytes and may
53+
return fewer even when more data is pending (e.g. chunked transfer encoding),
54+
so a single ``read(max_bytes + 1)`` cannot enforce the bound on its own. Read
55+
in a loop until EOF or until one byte past the limit has been accumulated.
56+
"""
57+
chunks: list[bytes] = []
58+
total = 0
59+
limit = max_bytes + 1
60+
while total < limit:
61+
chunk = response.read(min(READ_CHUNK_SIZE, limit - total))
62+
if not chunk:
63+
break
64+
chunks.append(chunk)
65+
total += len(chunk)
66+
if total > max_bytes:
4167
_raise(error_type, f"{label} exceeds maximum size of {max_bytes} bytes")
42-
return data
68+
return b"".join(chunks)
4369

4470

4571
def normalize_sha256(value: object, *, error_type: type[ErrorT] = ValueError) -> str | None:

src/specify_cli/_github_http.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from typing import Dict
1313
from urllib.parse import urlparse
1414

15+
from specify_cli._download_security import is_https_or_localhost_http
16+
1517
# GitHub-owned hostnames that should receive the Authorization header.
1618
# Includes codeload.github.com because GitHub archive URL downloads
1719
# (e.g. /archive/refs/tags/<tag>.zip) redirect there and require auth
@@ -55,12 +57,6 @@ def build_github_request(url: str) -> urllib.request.Request:
5557
return urllib.request.Request(url, headers=headers)
5658

5759

58-
def _is_https_or_localhost_http(url: str) -> bool:
59-
parsed = urlparse(url)
60-
is_localhost = parsed.hostname in ("localhost", "127.0.0.1", "::1")
61-
return parsed.scheme == "https" or (parsed.scheme == "http" and is_localhost)
62-
63-
6460
class _StripAuthOnRedirect(urllib.request.HTTPRedirectHandler):
6561
"""Redirect handler that drops the Authorization header when leaving GitHub.
6662
@@ -70,7 +66,7 @@ class _StripAuthOnRedirect(urllib.request.HTTPRedirectHandler):
7066
"""
7167

7268
def redirect_request(self, req, fp, code, msg, headers, newurl):
73-
if not _is_https_or_localhost_http(newurl):
69+
if not is_https_or_localhost_http(newurl):
7470
raise urllib.error.URLError(
7571
f"Refusing unsafe redirect to non-HTTPS URL: {newurl}"
7672
)

src/specify_cli/authentication/azure_devops.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717
_ADO_RESOURCE_ID = "499b84ac-1321-427f-aa17-267ca6975798"
1818

1919

20+
class _TokenResponseTooLarge(Exception):
21+
"""Raised when an Azure AD token response exceeds the bounded read limit."""
22+
23+
2024
class AzureDevOpsAuth(AuthProvider):
2125
"""Azure DevOps authentication provider.
2226
@@ -115,6 +119,7 @@ def _acquire_via_client_credentials(entry: AuthConfigEntry) -> str | None:
115119
payload = _json.loads(
116120
read_response_limited(
117121
resp,
122+
error_type=_TokenResponseTooLarge,
118123
label="Azure DevOps token response",
119124
).decode("utf-8")
120125
)
@@ -123,7 +128,10 @@ def _acquire_via_client_credentials(entry: AuthConfigEntry) -> str | None:
123128
except (
124129
urllib.error.URLError,
125130
OSError,
126-
ValueError,
127-
KeyError,
131+
_json.JSONDecodeError,
132+
_TokenResponseTooLarge,
128133
):
134+
# Network failure, malformed JSON, or an oversized response — fall
135+
# through to the next strategy. Unrelated programming errors (other
136+
# ValueErrors, KeyErrors) intentionally propagate so they surface.
129137
return None

src/specify_cli/authentication/http.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from fnmatch import fnmatch
1717
from urllib.parse import urlparse
1818

19+
from .._download_security import is_https_or_localhost_http
1920
from . import get_provider
2021
from .config import AuthConfigEntry, _default_config_path, find_entries_for_url, load_auth_config
2122

@@ -56,12 +57,6 @@ def _hostname_in_hosts(hostname: str, hosts: tuple[str, ...]) -> bool:
5657
return any(p == hostname or fnmatch(hostname, p) for p in hosts)
5758

5859

59-
def _is_https_or_localhost_http(url: str) -> bool:
60-
parsed = urlparse(url)
61-
is_localhost = parsed.hostname in ("localhost", "127.0.0.1", "::1")
62-
return parsed.scheme == "https" or (parsed.scheme == "http" and is_localhost)
63-
64-
6560
class _StripAuthOnRedirect(urllib.request.HTTPRedirectHandler):
6661
"""Drop ``Authorization`` when a redirect leaves the entry's declared hosts."""
6762

@@ -70,7 +65,7 @@ def __init__(self, hosts: tuple[str, ...]) -> None:
7065
self._hosts = hosts
7166

7267
def redirect_request(self, req, fp, code, msg, headers, newurl):
73-
if not _is_https_or_localhost_http(newurl):
68+
if not is_https_or_localhost_http(newurl):
7469
raise urllib.error.URLError(
7570
f"Refusing unsafe redirect to non-HTTPS URL: {newurl}"
7671
)

tests/integrations/test_integration_catalog.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -177,9 +177,16 @@ class FakeResponse:
177177
def __init__(self, data, url=""):
178178
self._data = json.dumps(data).encode()
179179
self._url = url if isinstance(url, str) else url.full_url
180+
self._pos = 0
180181

181-
def read(self, _size=-1):
182-
return self._data
182+
def read(self, size=-1):
183+
# Advance a cursor and return b"" at EOF like a real stream, so
184+
# read_response_limited's bounded loop terminates.
185+
if size is None or size < 0:
186+
size = len(self._data) - self._pos
187+
out = self._data[self._pos : self._pos + size]
188+
self._pos += len(out)
189+
return out
183190

184191
def geturl(self):
185192
return self._url
@@ -553,9 +560,16 @@ class FakeResponse:
553560
def __init__(self, data, url=""):
554561
self._data = json.dumps(data).encode()
555562
self._url = url if isinstance(url, str) else url.full_url
556-
557-
def read(self, _size=-1):
558-
return self._data
563+
self._pos = 0
564+
565+
def read(self, size=-1):
566+
# Advance a cursor and return b"" at EOF like a real stream, so
567+
# read_response_limited's bounded loop terminates.
568+
if size is None or size < 0:
569+
size = len(self._data) - self._pos
570+
out = self._data[self._pos : self._pos + size]
571+
self._pos += len(out)
572+
return out
559573

560574
def geturl(self):
561575
return self._url

tests/test_authentication.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from __future__ import annotations
1515

1616
import base64
17+
import io
1718
import json
1819
import os
1920

@@ -497,7 +498,7 @@ def test_resolve_token_azure_ad_success(self, monkeypatch):
497498
tenant_id="tid", client_id="cid", client_secret_env="MY_SECRET",
498499
)
499500
mock_resp = MagicMock()
500-
mock_resp.read.return_value = b'{"access_token": "ad-acquired-token"}'
501+
mock_resp.read.side_effect = io.BytesIO(b'{"access_token": "ad-acquired-token"}').read
501502
mock_resp.__enter__ = lambda s: s
502503
mock_resp.__exit__ = MagicMock(return_value=False)
503504
with patch("urllib.request.urlopen", return_value=mock_resp):
@@ -825,7 +826,7 @@ def _capture_request(self):
825826
def side_effect(req, timeout=None):
826827
captured["request"] = req
827828
body = _json.dumps({"tag_name": "v9.9.9"}).encode()
828-
resp = MagicMock(); resp.read.return_value = body
829+
resp = MagicMock(); resp.read.side_effect = io.BytesIO(body).read
829830
cm = MagicMock(); cm.__enter__.return_value = resp; cm.__exit__.return_value = False
830831
return cm
831832
return captured, side_effect

tests/test_download_security.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,23 @@
2525

2626

2727
class _Response:
28-
def __init__(self, data: bytes):
28+
"""Faithful stream stand-in: read() advances a cursor and returns b"" at EOF."""
29+
30+
def __init__(self, data: bytes, *, chunk: int | None = None):
2931
self.data = data
32+
self.pos = 0
33+
# When set, never return more than *chunk* bytes per call even if more is
34+
# requested — simulates short reads (e.g. chunked transfer encoding).
35+
self.chunk = chunk
3036

3137
def read(self, size: int = -1) -> bytes:
32-
return self.data if size < 0 else self.data[:size]
38+
if size < 0:
39+
size = len(self.data) - self.pos
40+
if self.chunk is not None:
41+
size = min(size, self.chunk)
42+
out = self.data[self.pos : self.pos + size]
43+
self.pos += len(out)
44+
return out
3345

3446

3547
class _CustomZipError(ValueError):
@@ -93,6 +105,19 @@ def test_read_response_limited_rejects_oversized_download():
93105
read_response_limited(_Response(b"abcde"), max_bytes=4)
94106

95107

108+
def test_read_response_limited_returns_full_body_within_limit():
109+
assert read_response_limited(_Response(b"abcde"), max_bytes=10) == b"abcde"
110+
111+
112+
def test_read_response_limited_enforces_bound_under_short_reads():
113+
# A server that streams more than max_bytes total while every read() returns
114+
# fewer bytes than requested (chunked encoding) must still be rejected — a
115+
# single read(max_bytes + 1) could be fooled, the accumulating loop cannot.
116+
response = _Response(b"x" * 100, chunk=8)
117+
with pytest.raises(ValueError, match="exceeds maximum size"):
118+
read_response_limited(response, max_bytes=16)
119+
120+
96121
def test_remote_downloads_do_not_use_unbounded_response_reads():
97122
offenders = []
98123
for path in (REPO_ROOT / "src" / "specify_cli").rglob("*.py"):

tests/test_extensions.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
"""
1111

1212
import pytest
13+
import io
1314
import json
1415
import hashlib
1516
import os
@@ -2820,7 +2821,7 @@ def test_fetch_single_catalog_sends_auth_header(self, temp_dir, monkeypatch):
28202821

28212822
catalog_data = {"schema_version": "1.0", "extensions": {}}
28222823
mock_response = MagicMock()
2823-
mock_response.read.return_value = json.dumps(catalog_data).encode()
2824+
mock_response.read.side_effect = io.BytesIO(json.dumps(catalog_data).encode()).read
28242825
mock_response.__enter__ = lambda s: s
28252826
mock_response.__exit__ = MagicMock(return_value=False)
28262827
mock_response.geturl.return_value = "https://raw.githubusercontent.com/org/repo/main/catalog.json"
@@ -2904,7 +2905,7 @@ def test_download_extension_sends_auth_header(self, temp_dir, monkeypatch):
29042905
zip_bytes = zip_buf.getvalue()
29052906

29062907
mock_response = MagicMock()
2907-
mock_response.read.return_value = zip_bytes
2908+
mock_response.read.side_effect = io.BytesIO(zip_bytes).read
29082909
mock_response.__enter__ = lambda s: s
29092910
mock_response.__exit__ = MagicMock(return_value=False)
29102911

@@ -2937,7 +2938,7 @@ def test_download_extension_verifies_sha256(self, temp_dir):
29372938
catalog = self._make_catalog(temp_dir)
29382939
zip_bytes = b"fake zip data"
29392940
mock_response = MagicMock()
2940-
mock_response.read.return_value = zip_bytes
2941+
mock_response.read.side_effect = io.BytesIO(zip_bytes).read
29412942
mock_response.__enter__ = lambda s: s
29422943
mock_response.__exit__ = MagicMock(return_value=False)
29432944
ext_info = {
@@ -2960,7 +2961,7 @@ def test_download_extension_rejects_sha256_mismatch(self, temp_dir):
29602961

29612962
catalog = self._make_catalog(temp_dir)
29622963
mock_response = MagicMock()
2963-
mock_response.read.return_value = b"fake zip data"
2964+
mock_response.read.side_effect = io.BytesIO(b"fake zip data").read
29642965
mock_response.__enter__ = lambda s: s
29652966
mock_response.__exit__ = MagicMock(return_value=False)
29662967
ext_info = {
@@ -4034,7 +4035,7 @@ def test_download_extension_allows_bundled_with_url(self, temp_dir):
40344035
}
40354036

40364037
mock_response = MagicMock()
4037-
mock_response.read.return_value = b"fake zip data"
4038+
mock_response.read.side_effect = io.BytesIO(b"fake zip data").read
40384039
mock_response.__enter__ = lambda s: s
40394040
mock_response.__exit__ = MagicMock(return_value=False)
40404041

tests/test_presets.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
"""
1212

1313
import pytest
14+
import io
1415
import json
1516
import hashlib
1617
import tempfile
@@ -1522,7 +1523,7 @@ def test_fetch_single_catalog_sends_auth_header(self, project_dir, monkeypatch):
15221523

15231524
catalog_data = {"schema_version": "1.0", "presets": {}}
15241525
mock_response = MagicMock()
1525-
mock_response.read.return_value = json.dumps(catalog_data).encode()
1526+
mock_response.read.side_effect = io.BytesIO(json.dumps(catalog_data).encode()).read
15261527
mock_response.__enter__ = lambda s: s
15271528
mock_response.__exit__ = MagicMock(return_value=False)
15281529
mock_response.geturl.return_value = "https://raw.githubusercontent.com/org/repo/main/presets/catalog.json"
@@ -1563,7 +1564,7 @@ def test_download_pack_sends_auth_header(self, project_dir, monkeypatch):
15631564
zip_bytes = zip_buf.getvalue()
15641565

15651566
mock_response = MagicMock()
1566-
mock_response.read.return_value = zip_bytes
1567+
mock_response.read.side_effect = io.BytesIO(zip_bytes).read
15671568
mock_response.__enter__ = lambda s: s
15681569
mock_response.__exit__ = MagicMock(return_value=False)
15691570

@@ -1638,7 +1639,7 @@ def test_download_pack_verifies_sha256(self, project_dir):
16381639
catalog = PresetCatalog(project_dir)
16391640
zip_bytes = b"fake zip data"
16401641
mock_response = MagicMock()
1641-
mock_response.read.return_value = zip_bytes
1642+
mock_response.read.side_effect = io.BytesIO(zip_bytes).read
16421643
mock_response.__enter__ = lambda s: s
16431644
mock_response.__exit__ = MagicMock(return_value=False)
16441645
pack_info = {
@@ -1662,7 +1663,7 @@ def test_download_pack_rejects_sha256_mismatch(self, project_dir):
16621663

16631664
catalog = PresetCatalog(project_dir)
16641665
mock_response = MagicMock()
1665-
mock_response.read.return_value = b"fake zip data"
1666+
mock_response.read.side_effect = io.BytesIO(b"fake zip data").read
16661667
mock_response.__enter__ = lambda s: s
16671668
mock_response.__exit__ = MagicMock(return_value=False)
16681669
pack_info = {

tests/test_upgrade.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
(if installed) with `--disable-socket` as an extra safety net.
99
"""
1010

11+
import io
1112
import json
1213
import urllib.error
1314
import importlib.metadata
@@ -39,7 +40,10 @@
3940
def _mock_urlopen_response(payload: dict) -> MagicMock:
4041
body = json.dumps(payload).encode("utf-8")
4142
resp = MagicMock()
42-
resp.read.return_value = body
43+
# Back read() with a real stream so it advances and returns b"" at EOF,
44+
# matching http.client.HTTPResponse (a fixed return_value would loop forever
45+
# under read_response_limited's bounded read loop).
46+
resp.read.side_effect = io.BytesIO(body).read
4347
cm = MagicMock()
4448
cm.__enter__.return_value = resp
4549
cm.__exit__.return_value = False

0 commit comments

Comments
 (0)