From 73a245472bb89514b13bd3df4e7fcf7ed4252d04 Mon Sep 17 00:00:00 2001 From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com> Date: Sat, 16 May 2026 12:19:19 +0530 Subject: [PATCH 1/8] feat add fetch_url_tool for reading specific web page URLs in AI chat Co-Authored-By: Claude Sonnet 4.6 --- backend/utils/retrieval/tools/web_tools.py | 109 +++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 backend/utils/retrieval/tools/web_tools.py diff --git a/backend/utils/retrieval/tools/web_tools.py b/backend/utils/retrieval/tools/web_tools.py new file mode 100644 index 00000000000..d04a9f8d2f0 --- /dev/null +++ b/backend/utils/retrieval/tools/web_tools.py @@ -0,0 +1,109 @@ +""" +Tools for fetching content from specific URLs. +""" + +import re +import logging +from html.parser import HTMLParser +from langchain_core.tools import tool +from utils.http_client import get_webhook_client +from utils.log_sanitizer import sanitize + +logger = logging.getLogger(__name__) + +_SKIP_TAGS = {'script', 'style', 'noscript', 'head', 'meta', 'link', 'svg', 'iframe', 'nav', 'footer'} +_MAX_CONTENT_CHARS = 8000 + + +_BLOCK_TAGS = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br', 'tr', 'blockquote', 'section', 'article'} + + +class _TextExtractor(HTMLParser): + def __init__(self): + super().__init__() + self._skip_depth = 0 + self.chunks = [] + + def handle_starttag(self, tag, attrs): + if tag in _SKIP_TAGS: + self._skip_depth += 1 + elif tag in _BLOCK_TAGS and self._skip_depth == 0 and self.chunks: + self.chunks.append('\n') + + def handle_endtag(self, tag): + if tag in _SKIP_TAGS and self._skip_depth > 0: + self._skip_depth -= 1 + + def handle_data(self, data): + if self._skip_depth == 0: + text = data.strip() + if text: + self.chunks.append(text) + + +def _html_to_text(html: str) -> str: + parser = _TextExtractor() + try: + parser.feed(html) + except Exception: + pass + text = ' '.join(parser.chunks) + text = re.sub(r' \n ', '\n', text) + text = re.sub(r'\n{3,}', '\n\n', text) + return text.strip() + + +@tool +async def fetch_url_tool(url: str) -> str: + """ + Fetch and read the content of a specific web page URL. + + Use this tool when: + - The user shares a direct URL and asks you to read, summarize, or analyze it + - The user says "check this link", "what does this page say", "summarize this article" with a URL + - You need to read the actual content at a specific web address + + DO NOT use this tool for general web searches — use web_search instead. + + Args: + url: The full URL to fetch (must start with http:// or https://) + + Returns: + The readable text content of the page (up to 8000 characters) + """ + logger.info(f"fetch_url_tool called - url: {url}") + + if not url.startswith(('http://', 'https://')): + return "Error: URL must start with http:// or https://" + + try: + client = get_webhook_client() + headers = { + 'User-Agent': 'Mozilla/5.0 (compatible; Omi-AI-Bot/1.0)', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + } + response = await client.get(url, headers=headers, timeout=15.0, follow_redirects=True) + + if response.status_code != 200: + logger.warning(f"fetch_url_tool - HTTP {response.status_code} for {url}") + return f"Error: Could not fetch page (HTTP {response.status_code})" + + content_type = response.headers.get('content-type', '') + if 'text/html' in content_type or 'text/plain' in content_type or not content_type: + text = _html_to_text(response.text) + else: + return f"Error: Unsupported content type '{content_type}'. Only HTML and plain text pages can be read." + + if not text: + return "Error: Page appears to be empty or has no readable text content." + + if len(text) > _MAX_CONTENT_CHARS: + text = text[:_MAX_CONTENT_CHARS] + f'\n\n[Content truncated — {len(text)} total characters]' + + logger.info(f"fetch_url_tool - fetched {len(text)} chars from {url}") + return f"Content from {url}:\n\n{text}" + + except Exception as e: + logger.error(f"fetch_url_tool - error fetching {url}: {sanitize(str(e))}") + return f"Error: Failed to fetch the URL. {sanitize(str(e))}" From eae612dc9151a4719bc70fef727c8c7961707bb8 Mon Sep 17 00:00:00 2001 From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com> Date: Sat, 16 May 2026 12:19:23 +0530 Subject: [PATCH 2/8] feat export fetch_url_tool from tools __init__ Co-Authored-By: Claude Sonnet 4.6 --- backend/utils/retrieval/tools/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backend/utils/retrieval/tools/__init__.py b/backend/utils/retrieval/tools/__init__.py index f9bc2f6cccc..d511948a6ef 100644 --- a/backend/utils/retrieval/tools/__init__.py +++ b/backend/utils/retrieval/tools/__init__.py @@ -53,6 +53,9 @@ from .preference_tools import ( save_user_preference_tool, ) +from .web_tools import ( + fetch_url_tool, +) __all__ = [ 'get_conversations_tool', @@ -79,4 +82,5 @@ 'get_screen_activity_tool', 'search_screen_activity_tool', 'save_user_preference_tool', + 'fetch_url_tool', ] From 81a187d72b48367f4f94a808397e83c6eaf70711 Mon Sep 17 00:00:00 2001 From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com> Date: Sat, 16 May 2026 12:19:26 +0530 Subject: [PATCH 3/8] feat wire fetch_url_tool into CORE_TOOLS so AI chat can read direct URLs Co-Authored-By: Claude Sonnet 4.6 --- backend/utils/retrieval/agentic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/utils/retrieval/agentic.py b/backend/utils/retrieval/agentic.py index 84c3697ffb4..45218b741f0 100644 --- a/backend/utils/retrieval/agentic.py +++ b/backend/utils/retrieval/agentic.py @@ -44,6 +44,7 @@ get_screen_activity_tool, search_screen_activity_tool, save_user_preference_tool, + fetch_url_tool, ) from utils.retrieval.tools.app_tools import load_app_tools, get_tool_status_message from utils.retrieval.safety import AgentSafetyGuard, SafetyGuardError @@ -96,6 +97,7 @@ def decorator(func): get_screen_activity_tool, search_screen_activity_tool, save_user_preference_tool, + fetch_url_tool, ] # Standard tool names (used to detect app tools by exclusion) @@ -133,6 +135,7 @@ def get_tool_display_name(tool_name: str, tool_obj: Optional[Any] = None) -> str 'get_screen_activity_tool': 'Checking screen activity', 'search_screen_activity_tool': 'Searching screen activity', 'save_user_preference_tool': 'Saving preference', + 'fetch_url_tool': 'Reading page', } if tool_name in tool_display_map: From 34a8ef963f44ad0cf9cbd0faf614cf174da093a6 Mon Sep 17 00:00:00 2001 From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com> Date: Sat, 16 May 2026 20:41:00 +0530 Subject: [PATCH 4/8] feat add get_web_fetch_client isolated from webhook pool for URL fetches Co-Authored-By: Claude Sonnet 4.6 --- backend/utils/http_client.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/backend/utils/http_client.py b/backend/utils/http_client.py index 223d7dfdacf..e5ca7fd1022 100644 --- a/backend/utils/http_client.py +++ b/backend/utils/http_client.py @@ -230,6 +230,7 @@ def get_tts_semaphore() -> asyncio.Semaphore: _auth_client: httpx.AsyncClient | None = None _stt_client: httpx.AsyncClient | None = None _tts_client: httpx.AsyncClient | None = None +_web_fetch_client: httpx.AsyncClient | None = None def get_webhook_client() -> httpx.AsyncClient: @@ -299,10 +300,25 @@ def get_tts_client() -> httpx.AsyncClient: return _tts_client +def get_web_fetch_client() -> httpx.AsyncClient: + """Return a shared async HTTP client for user-initiated URL fetches. + + Isolated from the webhook pool so slow/stalled external pages don't + compete with partner webhook delivery slots. + """ + global _web_fetch_client + if _web_fetch_client is None: + _web_fetch_client = httpx.AsyncClient( + timeout=httpx.Timeout(15.0, connect=5.0), + limits=httpx.Limits(max_connections=16, max_keepalive_connections=4), + ) + return _web_fetch_client + + async def close_all_clients(): """Close all shared HTTP clients. Call at app shutdown.""" - global _webhook_client, _maps_client, _auth_client, _stt_client, _tts_client - for client in (_webhook_client, _maps_client, _auth_client, _stt_client, _tts_client): + global _webhook_client, _maps_client, _auth_client, _stt_client, _tts_client, _web_fetch_client + for client in (_webhook_client, _maps_client, _auth_client, _stt_client, _tts_client, _web_fetch_client): if client is not None: try: await client.aclose() @@ -313,6 +329,7 @@ async def close_all_clients(): _auth_client = None _tts_client = None _stt_client = None + _web_fetch_client = None # Reset stateful registries _semaphores.clear() _webhook_circuit_breakers.clear() From 8773b8394a4148ebb45c4793609aa831914c5e0c Mon Sep 17 00:00:00 2001 From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com> Date: Sat, 16 May 2026 20:41:08 +0530 Subject: [PATCH 5/8] fix fetch_url_tool SSRF guard, stream body cap, sanitize URL logs, fix content-type check Co-Authored-By: Claude Sonnet 4.6 --- backend/utils/retrieval/tools/web_tools.py | 155 +++++++++++++++++---- 1 file changed, 127 insertions(+), 28 deletions(-) diff --git a/backend/utils/retrieval/tools/web_tools.py b/backend/utils/retrieval/tools/web_tools.py index d04a9f8d2f0..1bd56c0947c 100644 --- a/backend/utils/retrieval/tools/web_tools.py +++ b/backend/utils/retrieval/tools/web_tools.py @@ -2,20 +2,60 @@ Tools for fetching content from specific URLs. """ +import asyncio +import ipaddress import re import logging from html.parser import HTMLParser +from urllib.parse import urlparse + from langchain_core.tools import tool -from utils.http_client import get_webhook_client + +from utils.http_client import get_web_fetch_client from utils.log_sanitizer import sanitize logger = logging.getLogger(__name__) _SKIP_TAGS = {'script', 'style', 'noscript', 'head', 'meta', 'link', 'svg', 'iframe', 'nav', 'footer'} +_BLOCK_TAGS = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br', 'tr', 'blockquote', 'section', 'article'} _MAX_CONTENT_CHARS = 8000 +_MAX_BODY_BYTES = 512 * 1024 # cap before HTML parsing +_MAX_REDIRECTS = 5 + +# RFC-1918, loopback, link-local (incl. cloud metadata), carrier-grade NAT, IPv6 private +_PRIVATE_NETWORKS = [ + ipaddress.ip_network('127.0.0.0/8'), + ipaddress.ip_network('10.0.0.0/8'), + ipaddress.ip_network('172.16.0.0/12'), + ipaddress.ip_network('192.168.0.0/16'), + ipaddress.ip_network('169.254.0.0/16'), + ipaddress.ip_network('100.64.0.0/10'), + ipaddress.ip_network('::1/128'), + ipaddress.ip_network('fe80::/10'), + ipaddress.ip_network('fc00::/7'), +] + +_PARSEABLE_TYPES = ('text/html', 'text/plain', 'application/xhtml+xml', 'application/xml') + + +def _is_private_ip(ip_str: str) -> bool: + try: + ip = ipaddress.ip_address(ip_str) + return any(ip in net for net in _PRIVATE_NETWORKS) + except ValueError: + return True # unparseable → treat as blocked -_BLOCK_TAGS = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br', 'tr', 'blockquote', 'section', 'article'} +async def _hostname_is_public(hostname: str) -> bool: + """Resolve hostname and return True only if every IP is a public address.""" + try: + loop = asyncio.get_running_loop() + results = await loop.getaddrinfo(hostname, None) + if not results: + return False + return not any(_is_private_ip(r[4][0]) for r in results) + except Exception: + return False class _TextExtractor(HTMLParser): @@ -53,6 +93,62 @@ def _html_to_text(html: str) -> str: return text.strip() +async def _fetch_page(url: str, headers: dict) -> tuple[int, str, str]: + """ + Fetch *url* with SSRF guard, manual redirect following, and a body-size cap. + Returns (status_code, content_type, body_text). + Raises ValueError on SSRF/redirect violations. + """ + client = get_web_fetch_client() + + for _ in range(_MAX_REDIRECTS + 1): + if not url.startswith(('http://', 'https://')): + raise ValueError('Redirect target must use http:// or https://') + + parsed = urlparse(url) + hostname = parsed.hostname or '' + if not hostname: + raise ValueError('Invalid URL: no hostname') + + if not await _hostname_is_public(hostname): + raise ValueError('URL resolves to a private or reserved address') + + redirect_url = None + status = 0 + content_type = '' + body_text = '' + + async with client.stream('GET', url, headers=headers, follow_redirects=False) as response: + status = response.status_code + content_type = response.headers.get('content-type', '') + + if status in (301, 302, 303, 307, 308): + location = response.headers.get('location', '') + redirect_url = location + else: + cl_header = response.headers.get('content-length') + if cl_header and int(cl_header) > _MAX_BODY_BYTES: + return status, content_type, '' + + chunks = [] + total = 0 + async for chunk in response.aiter_bytes(chunk_size=8192): + total += len(chunk) + chunks.append(chunk) + if total >= _MAX_BODY_BYTES: + break + + body_text = b''.join(chunks).decode('utf-8', errors='replace') + + if redirect_url is not None: + url = redirect_url + continue + + return status, content_type, body_text + + raise ValueError('Too many redirects') + + @tool async def fetch_url_tool(url: str) -> str: """ @@ -71,39 +167,42 @@ async def fetch_url_tool(url: str) -> str: Returns: The readable text content of the page (up to 8000 characters) """ - logger.info(f"fetch_url_tool called - url: {url}") + logger.info(f"fetch_url_tool called - url: {sanitize(url)}") if not url.startswith(('http://', 'https://')): - return "Error: URL must start with http:// or https://" + return 'Error: URL must start with http:// or https://' + + headers = { + 'User-Agent': 'Mozilla/5.0 (compatible; Omi-AI-Bot/1.0)', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,text/plain;q=0.8,*/*;q=0.7', + 'Accept-Language': 'en-US,en;q=0.5', + } try: - client = get_webhook_client() - headers = { - 'User-Agent': 'Mozilla/5.0 (compatible; Omi-AI-Bot/1.0)', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - } - response = await client.get(url, headers=headers, timeout=15.0, follow_redirects=True) + status, content_type, body = await _fetch_page(url, headers) + except ValueError as e: + logger.warning(f"fetch_url_tool blocked - {sanitize(str(e))}") + return f'Error: {sanitize(str(e))}' + except Exception as e: + logger.error(f"fetch_url_tool - error fetching {sanitize(url)}: {sanitize(str(e))}") + return f'Error: Failed to fetch the URL. {sanitize(str(e))}' - if response.status_code != 200: - logger.warning(f"fetch_url_tool - HTTP {response.status_code} for {url}") - return f"Error: Could not fetch page (HTTP {response.status_code})" + if status != 200: + logger.warning(f"fetch_url_tool - HTTP {status} for {sanitize(url)}") + return f'Error: Could not fetch page (HTTP {status})' - content_type = response.headers.get('content-type', '') - if 'text/html' in content_type or 'text/plain' in content_type or not content_type: - text = _html_to_text(response.text) - else: - return f"Error: Unsupported content type '{content_type}'. Only HTML and plain text pages can be read." + if not any(t in content_type for t in _PARSEABLE_TYPES) and content_type: + return f"Error: Unsupported content type '{content_type}'. Only HTML and plain text pages can be read." - if not text: - return "Error: Page appears to be empty or has no readable text content." + if not body: + return 'Error: Page appears to be empty or too large to read.' - if len(text) > _MAX_CONTENT_CHARS: - text = text[:_MAX_CONTENT_CHARS] + f'\n\n[Content truncated — {len(text)} total characters]' + text = _html_to_text(body) + if not text: + return 'Error: Page has no readable text content.' - logger.info(f"fetch_url_tool - fetched {len(text)} chars from {url}") - return f"Content from {url}:\n\n{text}" + if len(text) > _MAX_CONTENT_CHARS: + text = text[:_MAX_CONTENT_CHARS] + f'\n\n[Content truncated — {len(text)} total characters]' - except Exception as e: - logger.error(f"fetch_url_tool - error fetching {url}: {sanitize(str(e))}") - return f"Error: Failed to fetch the URL. {sanitize(str(e))}" + logger.info(f"fetch_url_tool - fetched {len(text)} chars from {sanitize(url)}") + return f'Content from {url}:\n\n{text}' From a56aa4295c892f9cdfcbb6a18aefed954510a680 Mon Sep 17 00:00:00 2001 From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com> Date: Wed, 27 May 2026 22:37:32 +0530 Subject: [PATCH 6/8] fix instruct Claude to always call fetch_url_tool when user shares a URL Co-Authored-By: Claude Sonnet 4.6 --- backend/utils/retrieval/agentic.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/backend/utils/retrieval/agentic.py b/backend/utils/retrieval/agentic.py index 45218b741f0..00a31df2432 100644 --- a/backend/utils/retrieval/agentic.py +++ b/backend/utils/retrieval/agentic.py @@ -564,6 +564,14 @@ async def execute_agentic_chat_stream( IMPORTANT: Always search for and use these tools when relevant. Never tell the user you don't have access to an integration if a matching tool exists above. """ + # Instruct Claude to use fetch_url_tool for any direct URL in the conversation. + # Without this, Claude's built-in "I can't browse links" behavior takes over. + system_prompt += """ + + +You have fetch_url_tool available. When the user shares any URL (starting with http:// or https://), you MUST call fetch_url_tool to read its content before responding. Never say you cannot browse, visit, or read a URL. Always attempt to fetch it first. +""" + # Convert tools to Anthropic format (core = visible, app = defer_loading) tool_schemas, tool_registry = _convert_tools(core_tools, app_tools) From ae3410965333342597326d9d5ba107d8ca639d65 Mon Sep 17 00:00:00 2001 From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com> Date: Wed, 27 May 2026 22:44:01 +0530 Subject: [PATCH 7/8] fix extract meta/OG tags and add URL fetch instruction to system prompt Co-Authored-By: Claude Sonnet 4.6 --- backend/utils/retrieval/tools/web_tools.py | 108 ++++++++++++++++++++- 1 file changed, 104 insertions(+), 4 deletions(-) diff --git a/backend/utils/retrieval/tools/web_tools.py b/backend/utils/retrieval/tools/web_tools.py index 1bd56c0947c..d30b6d6a564 100644 --- a/backend/utils/retrieval/tools/web_tools.py +++ b/backend/utils/retrieval/tools/web_tools.py @@ -4,6 +4,7 @@ import asyncio import ipaddress +import json import re import logging from html.parser import HTMLParser @@ -37,6 +38,99 @@ _PARSEABLE_TYPES = ('text/html', 'text/plain', 'application/xhtml+xml', 'application/xml') +# Fields to surface from JSON-LD structured data (schema.org), in display order. +_JSON_LD_FIELDS = [ + ('name', 'Title'), + ('headline', 'Headline'), + ('uploadDate', 'Upload date'), + ('datePublished', 'Published'), + ('dateModified', 'Modified'), + ('author', 'Author'), + ('description', 'Description'), + ('duration', 'Duration'), +] + + +def _extract_meta_tags(html: str) -> str: + """ + Extract page title, meta description, and Open Graph tags. + These are set even on fully JS-rendered pages (needed for SEO/social sharing) + and live inside , which the HTML stripper skips entirely. + """ + lines = [] + seen: set = set() + + def add(label: str, value: str) -> None: + value = value.strip() + if value and label not in seen: + seen.add(label) + lines.append(f'{label}: {value}') + + title_m = re.search(r']*>(.*?)', html, re.DOTALL | re.IGNORECASE) + if title_m: + add('Title', re.sub(r'<[^>]+>', '', title_m.group(1))) + + for m in re.finditer(r']+?)/?>', html, re.IGNORECASE): + attrs = m.group(1) + name_m = re.search(r'(?:name|property)=["\']([^"\']+)["\']', attrs, re.IGNORECASE) + content_m = re.search(r'content=["\']([^"\']*)["\']', attrs, re.IGNORECASE) + if not name_m or not content_m: + continue + name = name_m.group(1).lower().strip() + content = content_m.group(1).strip() + if not content: + continue + if name == 'description': + add('Description', content) + elif name == 'og:title': + add('Title', content) + elif name == 'og:description': + add('Description', content) + elif name == 'og:site_name': + add('Site', content) + elif name == 'og:type': + add('Type', content) + + return '\n'.join(lines) + + +def _extract_json_ld(html: str) -> str: + """ + Pull text from ', re.DOTALL | re.IGNORECASE + ) + lines = [] + for match in pattern.finditer(html): + try: + data = json.loads(match.group(1)) + except (json.JSONDecodeError, ValueError): + continue + + if isinstance(data, list): + items = data + else: + items = [data] + + for item in items: + if not isinstance(item, dict): + continue + for key, label in _JSON_LD_FIELDS: + val = item.get(key) + if not val: + continue + if isinstance(val, dict): + val = val.get('name') or val.get('@id') or str(val) + elif isinstance(val, list): + val = ', '.join(str(v.get('name', v) if isinstance(v, dict) else v) for v in val[:3]) + lines.append(f'{label}: {val}') + + return '\n'.join(lines) + def _is_private_ip(ip_str: str) -> bool: try: @@ -82,15 +176,21 @@ def handle_data(self, data): def _html_to_text(html: str) -> str: + meta = _extract_meta_tags(html) + structured = _extract_json_ld(html) + parser = _TextExtractor() try: parser.feed(html) except Exception: pass - text = ' '.join(parser.chunks) - text = re.sub(r' \n ', '\n', text) - text = re.sub(r'\n{3,}', '\n\n', text) - return text.strip() + body = ' '.join(parser.chunks) + body = re.sub(r' \n ', '\n', body) + body = re.sub(r'\n{3,}', '\n\n', body) + body = body.strip() + + parts = [p for p in (meta, structured, body) if p] + return '\n\n'.join(parts) async def _fetch_page(url: str, headers: dict) -> tuple[int, str, str]: From 992111ba58c3bdee1bc982f861f9ad8cb4d200c9 Mon Sep 17 00:00:00 2001 From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com> Date: Thu, 28 May 2026 07:31:12 +0530 Subject: [PATCH 8/8] fix resolve relative redirect URLs with urljoin to avoid ValueError on same-origin redirects Co-Authored-By: Claude Sonnet 4.6 --- backend/utils/retrieval/tools/web_tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/utils/retrieval/tools/web_tools.py b/backend/utils/retrieval/tools/web_tools.py index d30b6d6a564..507c74b633d 100644 --- a/backend/utils/retrieval/tools/web_tools.py +++ b/backend/utils/retrieval/tools/web_tools.py @@ -8,7 +8,7 @@ import re import logging from html.parser import HTMLParser -from urllib.parse import urlparse +from urllib.parse import urlparse, urljoin from langchain_core.tools import tool @@ -224,7 +224,7 @@ async def _fetch_page(url: str, headers: dict) -> tuple[int, str, str]: if status in (301, 302, 303, 307, 308): location = response.headers.get('location', '') - redirect_url = location + redirect_url = urljoin(url, location) else: cl_header = response.headers.get('content-length') if cl_header and int(cl_header) > _MAX_BODY_BYTES: