diff --git a/backend/utils/http_client.py b/backend/utils/http_client.py
index 223d7dfdacf..e5ca7fd1022 100644
--- a/backend/utils/http_client.py
+++ b/backend/utils/http_client.py
@@ -230,6 +230,7 @@ def get_tts_semaphore() -> asyncio.Semaphore:
_auth_client: httpx.AsyncClient | None = None
_stt_client: httpx.AsyncClient | None = None
_tts_client: httpx.AsyncClient | None = None
+_web_fetch_client: httpx.AsyncClient | None = None
def get_webhook_client() -> httpx.AsyncClient:
@@ -299,10 +300,25 @@ def get_tts_client() -> httpx.AsyncClient:
return _tts_client
def get_web_fetch_client() -> httpx.AsyncClient:
    """Return the shared async HTTP client used for user-initiated URL fetches.

    The client is built lazily on the first call and reused afterwards. It is
    kept separate from the webhook pool so that slow or stalled external pages
    do not compete with partner webhook delivery slots.
    """
    global _web_fetch_client
    if _web_fetch_client is not None:
        return _web_fetch_client

    # 15s default timeout with a shorter 5s budget for establishing the connection.
    fetch_timeout = httpx.Timeout(15.0, connect=5.0)
    # Small pool: at most 16 concurrent connections, 4 of them kept alive.
    pool_limits = httpx.Limits(max_connections=16, max_keepalive_connections=4)
    _web_fetch_client = httpx.AsyncClient(timeout=fetch_timeout, limits=pool_limits)
    return _web_fetch_client
+
+
async def close_all_clients():
"""Close all shared HTTP clients. Call at app shutdown."""
- global _webhook_client, _maps_client, _auth_client, _stt_client, _tts_client
- for client in (_webhook_client, _maps_client, _auth_client, _stt_client, _tts_client):
+ global _webhook_client, _maps_client, _auth_client, _stt_client, _tts_client, _web_fetch_client
+ for client in (_webhook_client, _maps_client, _auth_client, _stt_client, _tts_client, _web_fetch_client):
if client is not None:
try:
await client.aclose()
@@ -313,6 +329,7 @@ async def close_all_clients():
_auth_client = None
_tts_client = None
_stt_client = None
+ _web_fetch_client = None
# Reset stateful registries
_semaphores.clear()
_webhook_circuit_breakers.clear()
diff --git a/backend/utils/retrieval/agentic.py b/backend/utils/retrieval/agentic.py
index 84c3697ffb4..00a31df2432 100644
--- a/backend/utils/retrieval/agentic.py
+++ b/backend/utils/retrieval/agentic.py
@@ -44,6 +44,7 @@
get_screen_activity_tool,
search_screen_activity_tool,
save_user_preference_tool,
+ fetch_url_tool,
)
from utils.retrieval.tools.app_tools import load_app_tools, get_tool_status_message
from utils.retrieval.safety import AgentSafetyGuard, SafetyGuardError
@@ -96,6 +97,7 @@ def decorator(func):
get_screen_activity_tool,
search_screen_activity_tool,
save_user_preference_tool,
+ fetch_url_tool,
]
# Standard tool names (used to detect app tools by exclusion)
@@ -133,6 +135,7 @@ def get_tool_display_name(tool_name: str, tool_obj: Optional[Any] = None) -> str
'get_screen_activity_tool': 'Checking screen activity',
'search_screen_activity_tool': 'Searching screen activity',
'save_user_preference_tool': 'Saving preference',
+ 'fetch_url_tool': 'Reading page',
}
if tool_name in tool_display_map:
@@ -561,6 +564,14 @@ async def execute_agentic_chat_stream(
IMPORTANT: Always search for and use these tools when relevant. Never tell the user you don't have access to an integration if a matching tool exists above.
"""
+ # Instruct Claude to use fetch_url_tool for any direct URL in the conversation.
+ # Without this, Claude's built-in "I can't browse links" behavior takes over.
+ system_prompt += """
+
+
+You have fetch_url_tool available. When the user shares any URL (starting with http:// or https://), you MUST call fetch_url_tool to read its content before responding. Never say you cannot browse, visit, or read a URL. Always attempt to fetch it first.
+"""
+
# Convert tools to Anthropic format (core = visible, app = defer_loading)
tool_schemas, tool_registry = _convert_tools(core_tools, app_tools)
diff --git a/backend/utils/retrieval/tools/__init__.py b/backend/utils/retrieval/tools/__init__.py
index f9bc2f6cccc..d511948a6ef 100644
--- a/backend/utils/retrieval/tools/__init__.py
+++ b/backend/utils/retrieval/tools/__init__.py
@@ -53,6 +53,9 @@
from .preference_tools import (
save_user_preference_tool,
)
+from .web_tools import (
+ fetch_url_tool,
+)
__all__ = [
'get_conversations_tool',
@@ -79,4 +82,5 @@
'get_screen_activity_tool',
'search_screen_activity_tool',
'save_user_preference_tool',
+ 'fetch_url_tool',
]
diff --git a/backend/utils/retrieval/tools/web_tools.py b/backend/utils/retrieval/tools/web_tools.py
new file mode 100644
index 00000000000..507c74b633d
--- /dev/null
+++ b/backend/utils/retrieval/tools/web_tools.py
@@ -0,0 +1,308 @@
+"""
+Tools for fetching content from specific URLs.
+"""
+
+import asyncio
+import ipaddress
+import json
+import re
+import logging
+from html.parser import HTMLParser
+from urllib.parse import urlparse, urljoin
+
+from langchain_core.tools import tool
+
+from utils.http_client import get_web_fetch_client
+from utils.log_sanitizer import sanitize
+
+logger = logging.getLogger(__name__)
+
# Tags whose whole subtree is ignored when extracting readable text.
_SKIP_TAGS = {'script', 'style', 'noscript', 'head', 'meta', 'link', 'svg', 'iframe', 'nav', 'footer'}
# Tags that start a new line in the extracted text.
_BLOCK_TAGS = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br', 'tr', 'blockquote', 'section', 'article'}
_MAX_CONTENT_CHARS = 8000  # max characters of page text returned by the tool
_MAX_BODY_BYTES = 512 * 1024  # cap before HTML parsing
_MAX_REDIRECTS = 5  # redirect hops followed manually before giving up

# Address ranges the fetcher must never connect to (SSRF guard):
# "this network", RFC-1918, loopback, link-local (incl. cloud metadata),
# carrier-grade NAT, multicast, reserved, and the IPv6 equivalents.
_PRIVATE_NETWORKS = [
    ipaddress.ip_network('0.0.0.0/8'),  # "this network"; 0.0.0.0 can reach the local host
    ipaddress.ip_network('127.0.0.0/8'),
    ipaddress.ip_network('10.0.0.0/8'),
    ipaddress.ip_network('172.16.0.0/12'),
    ipaddress.ip_network('192.168.0.0/16'),
    ipaddress.ip_network('169.254.0.0/16'),
    ipaddress.ip_network('100.64.0.0/10'),
    ipaddress.ip_network('224.0.0.0/4'),  # IPv4 multicast
    ipaddress.ip_network('240.0.0.0/4'),  # reserved, incl. limited broadcast
    ipaddress.ip_network('::/128'),  # IPv6 unspecified address
    ipaddress.ip_network('::1/128'),
    # IPv4-mapped IPv6 (::ffff:a.b.c.d) is never inside an IPv4 network object,
    # so without this entry it would slip past every IPv4 range above.
    ipaddress.ip_network('::ffff:0:0/96'),
    ipaddress.ip_network('fe80::/10'),
    ipaddress.ip_network('fc00::/7'),
    ipaddress.ip_network('ff00::/8'),  # IPv6 multicast
]

# Content types (substring match against the Content-Type header) the tool will parse.
_PARSEABLE_TYPES = ('text/html', 'text/plain', 'application/xhtml+xml', 'application/xml')

# Fields to surface from JSON-LD structured data (schema.org), in display order.
_JSON_LD_FIELDS = [
    ('name', 'Title'),
    ('headline', 'Headline'),
    ('uploadDate', 'Upload date'),
    ('datePublished', 'Published'),
    ('dateModified', 'Modified'),
    ('author', 'Author'),
    ('description', 'Description'),
    ('duration', 'Duration'),
]
+
+
def _extract_meta_tags(html: str) -> str:
    """
    Extract page title, meta description, and Open Graph tags.

    These are set even on fully JS-rendered pages (needed for SEO/social sharing)
    and live inside <head>, which the HTML stripper skips entirely.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        Newline-separated ``Label: value`` lines (at most one per label, first
        non-empty value wins), or an empty string when nothing was found.
    """
    lines = []
    seen: set[str] = set()

    def add(label: str, value: str) -> None:
        # First non-empty value for a label wins, e.g. <title> beats og:title.
        value = value.strip()
        if value and label not in seen:
            seen.add(label)
            lines.append(f'{label}: {value}')

    flags = re.DOTALL | re.IGNORECASE

    title_m = re.search(r'<title[^>]*>(.*?)</title>', html, flags)
    if title_m:
        # Drop any markup nested inside the title element.
        add('Title', re.sub(r'<[^>]+>', '', title_m.group(1)))

    # Map of lower-cased meta name/property -> output label.
    labels = {
        'description': 'Description',
        'og:title': 'Title',
        'og:description': 'Description',
        'og:site_name': 'Site',
        'og:type': 'Type',
    }

    for m in re.finditer(r'<meta\s+([^>]+?)/?>', html, flags):
        attrs = m.group(1)
        # The back-reference closes the value on the same quote character that
        # opened it, so content="It's here" is not cut at the apostrophe.
        name_m = re.search(r'(?<![\w-])(?:name|property)\s*=\s*(["\'])(.*?)\1', attrs, flags)
        content_m = re.search(r'(?<![\w-])content\s*=\s*(["\'])(.*?)\1', attrs, flags)
        if not name_m or not content_m:
            continue
        name = name_m.group(2).lower().strip()
        content = content_m.group(2).strip()
        if not content:
            continue
        label = labels.get(name)
        if label is not None:
            add(label, content)

    return '\n'.join(lines)
+
+
def _extract_json_ld(html: str) -> str:
    """
    Pull text from <script type="application/ld+json"> blocks.

    Each block is parsed as JSON (a single object or a list of objects) and the
    fields named in ``_JSON_LD_FIELDS`` are emitted as ``Label: value`` lines.
    Blocks that are not valid JSON and items that are not objects are skipped.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        Newline-separated ``Label: value`` lines, or an empty string when the
        page carries no usable JSON-LD.
    """
    pattern = re.compile(
        r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.DOTALL | re.IGNORECASE
    )
    lines = []
    for match in pattern.finditer(html):
        try:
            data = json.loads(match.group(1))
        except (json.JSONDecodeError, ValueError):
            continue

        # Normalize to a list so single objects and arrays share one code path.
        items = data if isinstance(data, list) else [data]

        for item in items:
            if not isinstance(item, dict):
                continue
            for key, label in _JSON_LD_FIELDS:
                val = item.get(key)
                if not val:
                    continue
                if isinstance(val, dict):
                    # Nested object (e.g. an author): prefer its name, then its id.
                    val = val.get('name') or val.get('@id') or str(val)
                elif isinstance(val, list):
                    # Show at most the first three entries of a list-valued field.
                    val = ', '.join(str(v.get('name', v) if isinstance(v, dict) else v) for v in val[:3])
                lines.append(f'{label}: {val}')

    return '\n'.join(lines)
+
+
def _is_private_ip(ip_str: str) -> bool:
    """Return True when *ip_str* must not be fetched (non-public or unparseable).

    Args:
        ip_str: Textual IPv4 or IPv6 address.

    Returns:
        True for private, loopback, link-local, multicast, reserved or
        unspecified addresses, for anything inside ``_PRIVATE_NETWORKS``, and
        for strings that are not valid IP addresses; False otherwise.
    """
    try:
        ip = ipaddress.ip_address(ip_str)
    except ValueError:
        return True  # unparseable → treat as blocked

    # An IPv4-mapped IPv6 address (::ffff:a.b.c.d) reaches the embedded IPv4
    # host, so classify it by that IPv4 address rather than as generic IPv6.
    mapped = getattr(ip, 'ipv4_mapped', None)
    if mapped is not None:
        ip = mapped

    if (
        ip.is_private
        or ip.is_loopback
        or ip.is_link_local
        or ip.is_multicast
        or ip.is_reserved
        or ip.is_unspecified
    ):
        return True

    # Explicit list covers ranges the flags above miss (e.g. carrier-grade NAT).
    return any(ip in net for net in _PRIVATE_NETWORKS)
+
+
async def _hostname_is_public(hostname: str) -> bool:
    """Resolve *hostname* and report whether every resolved address is public.

    Any resolution failure, an empty result, or any other error counts as
    "not public", so the caller fails closed.
    """
    try:
        addr_infos = await asyncio.get_running_loop().getaddrinfo(hostname, None)
        if not addr_infos:
            return False
        for info in addr_infos:
            # Each entry is (family, type, proto, canonname, sockaddr);
            # sockaddr[0] is the textual IP address.
            if _is_private_ip(info[4][0]):
                return False
        return True
    except Exception:
        return False
+
+
class _TextExtractor(HTMLParser):
    """Collect the visible text of an HTML document.

    Text inside any ``_SKIP_TAGS`` element is dropped; a ``'\\n'`` chunk is
    emitted before each ``_BLOCK_TAGS`` element once some text has been
    collected. The result accumulates in ``self.chunks``.
    """

    # Void elements never have an end tag. Counting them toward the skip depth
    # would leave the depth raised for the rest of the document, so an ordinary
    # <meta charset="utf-8"> would suppress all of the body text.
    _VOID_TAGS = frozenset(
        {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'source', 'track', 'wbr'}
    )

    def __init__(self):
        super().__init__()
        self._skip_depth = 0  # >0 while inside at least one skipped element
        self.chunks = []  # text fragments and '\n' separators, in document order

    def handle_starttag(self, tag, attrs):
        if tag in _SKIP_TAGS and tag not in self._VOID_TAGS:
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS and self._skip_depth == 0 and self.chunks:
            self.chunks.append('\n')

    def handle_endtag(self, tag):
        # Void tags are ignored here too, so a self-closed <meta ... /> (which
        # triggers both the start and the end handler) stays balanced.
        if tag in _SKIP_TAGS and tag not in self._VOID_TAGS and self._skip_depth > 0:
            self._skip_depth -= 1

    def handle_data(self, data):
        if self._skip_depth == 0:
            text = data.strip()
            if text:
                self.chunks.append(text)
+
+
def _html_to_text(html: str) -> str:
    """Convert an HTML document to plain text.

    The output consists of up to three sections separated by blank lines:
    meta/Open Graph tags, JSON-LD structured data, and the visible body text.
    Empty sections are left out.
    """
    sections = [_extract_meta_tags(html), _extract_json_ld(html)]

    extractor = _TextExtractor()
    try:
        extractor.feed(html)
    except Exception:
        # Best effort: keep whatever text was collected before the parser failed.
        pass

    body_text = ' '.join(extractor.chunks)
    # Tidy the separators: drop the spaces the join put around newline chunks,
    # then collapse runs of three or more newlines into one blank line.
    for pattern, replacement in ((r' \n ', '\n'), (r'\n{3,}', '\n\n')):
        body_text = re.sub(pattern, replacement, body_text)
    sections.append(body_text.strip())

    return '\n\n'.join(filter(None, sections))
+
+
async def _fetch_page(url: str, headers: dict) -> tuple[int, str, str]:
    """
    Fetch *url* with SSRF guard, manual redirect following, and a body-size cap.

    Redirects are followed by hand (at most ``_MAX_REDIRECTS`` hops) so that
    every hop's hostname is re-checked before it is contacted.

    Args:
        url: Absolute http(s) URL to fetch.
        headers: Request headers sent on every hop.

    Returns:
        (status_code, content_type, body_text). ``body_text`` is empty when the
        declared Content-Length exceeds ``_MAX_BODY_BYTES``; otherwise it holds
        at most ``_MAX_BODY_BYTES`` bytes of the body, decoded as text.

    Raises:
        ValueError: on SSRF/redirect violations (non-http scheme, missing
            hostname, private/reserved address, redirect without a Location
            header, too many redirects).
    """
    client = get_web_fetch_client()

    for _ in range(_MAX_REDIRECTS + 1):
        if not url.startswith(('http://', 'https://')):
            raise ValueError('Redirect target must use http:// or https://')

        hostname = urlparse(url).hostname or ''
        if not hostname:
            raise ValueError('Invalid URL: no hostname')

        if not await _hostname_is_public(hostname):
            raise ValueError('URL resolves to a private or reserved address')

        async with client.stream('GET', url, headers=headers, follow_redirects=False) as response:
            status = response.status_code
            content_type = response.headers.get('content-type', '')

            if status in (301, 302, 303, 307, 308):
                location = response.headers.get('location', '').strip()
                if not location:
                    # urljoin(url, '') returns url itself, which would re-request
                    # the same page until the redirect budget ran out.
                    raise ValueError('Redirect response is missing a Location header')
                url = urljoin(url, location)
                continue

            # A malformed Content-Length is ignored; the streaming cap below
            # still bounds how much is read.
            try:
                declared_length = int(response.headers.get('content-length', ''))
            except ValueError:
                declared_length = None
            if declared_length is not None and declared_length > _MAX_BODY_BYTES:
                return status, content_type, ''

            raw = bytearray()
            async for chunk in response.aiter_bytes(chunk_size=8192):
                raw += chunk
                if len(raw) >= _MAX_BODY_BYTES:
                    break
            # The final chunk may overshoot the cap; trim to the exact limit.
            payload = bytes(raw[:_MAX_BODY_BYTES])

            # Honor the charset declared in Content-Type; fall back to UTF-8
            # when none is declared or the name is not a known text encoding.
            charset_m = re.search(r'charset\s*=\s*["\']?([\w.:-]+)', content_type, re.IGNORECASE)
            encoding = charset_m.group(1) if charset_m else 'utf-8'
            try:
                body_text = payload.decode(encoding, errors='replace')
            except LookupError:
                body_text = payload.decode('utf-8', errors='replace')

            return status, content_type, body_text

    raise ValueError('Too many redirects')
+
+
@tool
async def fetch_url_tool(url: str) -> str:
    """
    Fetch and read the content of a specific web page URL.

    Use this tool when:
    - The user shares a direct URL and asks you to read, summarize, or analyze it
    - The user says "check this link", "what does this page say", "summarize this article" with a URL
    - You need to read the actual content at a specific web address

    DO NOT use this tool for general web searches — use web_search instead.

    Args:
        url: The full URL to fetch (must start with http:// or https://)

    Returns:
        The readable text content of the page (up to 8000 characters)
    """
    # NOTE: the docstring above is left verbatim; presumably the @tool decorator
    # exposes it to the model as the tool description — confirm before editing.
    logger.info(f"fetch_url_tool called - url: {sanitize(url)}")

    if not url.startswith(('http://', 'https://')):
        return 'Error: URL must start with http:// or https://'

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; Omi-AI-Bot/1.0)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,text/plain;q=0.8,*/*;q=0.7',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    try:
        status, content_type, page_body = await _fetch_page(url, request_headers)
    except ValueError as exc:
        # ValueError is how _fetch_page reports SSRF/redirect violations.
        logger.warning(f"fetch_url_tool blocked - {sanitize(str(exc))}")
        return f'Error: {sanitize(str(exc))}'
    except Exception as exc:
        logger.error(f"fetch_url_tool - error fetching {sanitize(url)}: {sanitize(str(exc))}")
        return f'Error: Failed to fetch the URL. {sanitize(str(exc))}'

    if status != 200:
        logger.warning(f"fetch_url_tool - HTTP {status} for {sanitize(url)}")
        return f'Error: Could not fetch page (HTTP {status})'

    # A missing Content-Type is tolerated; a present but unsupported one is rejected.
    if content_type and not any(kind in content_type for kind in _PARSEABLE_TYPES):
        return f"Error: Unsupported content type '{content_type}'. Only HTML and plain text pages can be read."

    if not page_body:
        return 'Error: Page appears to be empty or too large to read.'

    readable = _html_to_text(page_body)
    if not readable:
        return 'Error: Page has no readable text content.'

    total_chars = len(readable)
    if total_chars > _MAX_CONTENT_CHARS:
        readable = readable[:_MAX_CONTENT_CHARS] + f'\n\n[Content truncated — {total_chars} total characters]'

    logger.info(f"fetch_url_tool - fetched {len(readable)} chars from {sanitize(url)}")
    return f'Content from {url}:\n\n{readable}'