From 73a245472bb89514b13bd3df4e7fcf7ed4252d04 Mon Sep 17 00:00:00 2001
From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com>
Date: Sat, 16 May 2026 12:19:19 +0530
Subject: [PATCH 1/8] feat add fetch_url_tool for reading specific web page
 URLs in AI chat

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/utils/retrieval/tools/web_tools.py | 109 +++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 backend/utils/retrieval/tools/web_tools.py

diff --git a/backend/utils/retrieval/tools/web_tools.py b/backend/utils/retrieval/tools/web_tools.py
new file mode 100644
index 00000000000..d04a9f8d2f0
--- /dev/null
+++ b/backend/utils/retrieval/tools/web_tools.py
@@ -0,0 +1,109 @@
+"""
+Tools for fetching content from specific URLs.
+"""
+
+import re
+import logging
+from html.parser import HTMLParser
+from langchain_core.tools import tool
+from utils.http_client import get_webhook_client
+from utils.log_sanitizer import sanitize
+
+logger = logging.getLogger(__name__)
+
+_SKIP_TAGS = {'script', 'style', 'noscript', 'head', 'meta', 'link', 'svg', 'iframe', 'nav', 'footer'}
+_MAX_CONTENT_CHARS = 8000
+
+
+_BLOCK_TAGS = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br', 'tr', 'blockquote', 'section', 'article'}
+
+
+class _TextExtractor(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self._skip_depth = 0  # > 0 while inside a skipped container such as <script> or <head>
+        self.chunks = []  # visible text fragments plus '\n' markers emitted at block-level tags
+
+    def handle_starttag(self, tag, attrs):
+        if tag in _SKIP_TAGS and tag not in ('meta', 'link'):  # void tags never close; don't count them
+            self._skip_depth += 1
+        elif tag in _BLOCK_TAGS and self._skip_depth == 0 and self.chunks:
+            self.chunks.append('\n')
+
+    def handle_endtag(self, tag):
+        if tag in _SKIP_TAGS and tag not in ('meta', 'link') and self._skip_depth > 0:  # mirror of starttag
+            self._skip_depth -= 1
+
+    def handle_data(self, data):
+        if self._skip_depth == 0:
+            text = data.strip()
+            if text:
+                self.chunks.append(text)
+
+
+def _html_to_text(html: str) -> str:
+    parser = _TextExtractor()
+    try:
+        parser.feed(html)
+    except Exception:
+        pass
+    text = ' '.join(parser.chunks)
+    text = re.sub(r' \n ', '\n', text)
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    return text.strip()
+
+
+@tool
+async def fetch_url_tool(url: str) -> str:
+    """
+    Fetch and read the content of a specific web page URL.
+
+    Use this tool when:
+    - The user shares a direct URL and asks you to read, summarize, or analyze it
+    - The user says "check this link", "what does this page say", "summarize this article" with a URL
+    - You need to read the actual content at a specific web address
+
+    DO NOT use this tool for general web searches — use web_search instead.
+
+    Args:
+        url: The full URL to fetch (must start with http:// or https://)
+
+    Returns:
+        The readable text content of the page (up to 8000 characters)
+    """
+    logger.info(f"fetch_url_tool called - url: {url}")
+
+    if not url.startswith(('http://', 'https://')):
+        return "Error: URL must start with http:// or https://"
+
+    try:
+        client = get_webhook_client()
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (compatible; Omi-AI-Bot/1.0)',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+        }
+        response = await client.get(url, headers=headers, timeout=15.0, follow_redirects=True)
+
+        if response.status_code != 200:
+            logger.warning(f"fetch_url_tool - HTTP {response.status_code} for {url}")
+            return f"Error: Could not fetch page (HTTP {response.status_code})"
+
+        content_type = response.headers.get('content-type', '')
+        if 'text/html' in content_type or 'text/plain' in content_type or not content_type:
+            text = _html_to_text(response.text)
+        else:
+            return f"Error: Unsupported content type '{content_type}'. Only HTML and plain text pages can be read."
+
+        if not text:
+            return "Error: Page appears to be empty or has no readable text content."
+
+        if len(text) > _MAX_CONTENT_CHARS:
+            text = text[:_MAX_CONTENT_CHARS] + f'\n\n[Content truncated — {len(text)} total characters]'
+
+        logger.info(f"fetch_url_tool - fetched {len(text)} chars from {url}")
+        return f"Content from {url}:\n\n{text}"
+
+    except Exception as e:
+        logger.error(f"fetch_url_tool - error fetching {url}: {sanitize(str(e))}")
+        return f"Error: Failed to fetch the URL. {sanitize(str(e))}"

From eae612dc9151a4719bc70fef727c8c7961707bb8 Mon Sep 17 00:00:00 2001
From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com>
Date: Sat, 16 May 2026 12:19:23 +0530
Subject: [PATCH 2/8] feat export fetch_url_tool from tools __init__

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/utils/retrieval/tools/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/backend/utils/retrieval/tools/__init__.py b/backend/utils/retrieval/tools/__init__.py
index f9bc2f6cccc..d511948a6ef 100644
--- a/backend/utils/retrieval/tools/__init__.py
+++ b/backend/utils/retrieval/tools/__init__.py
@@ -53,6 +53,9 @@
 from .preference_tools import (
     save_user_preference_tool,
 )
+from .web_tools import (
+    fetch_url_tool,
+)
 
 __all__ = [
     'get_conversations_tool',
@@ -79,4 +82,5 @@
     'get_screen_activity_tool',
     'search_screen_activity_tool',
     'save_user_preference_tool',
+    'fetch_url_tool',
 ]

From 81a187d72b48367f4f94a808397e83c6eaf70711 Mon Sep 17 00:00:00 2001
From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com>
Date: Sat, 16 May 2026 12:19:26 +0530
Subject: [PATCH 3/8] feat wire fetch_url_tool into CORE_TOOLS so AI chat can
 read direct URLs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/utils/retrieval/agentic.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/backend/utils/retrieval/agentic.py b/backend/utils/retrieval/agentic.py
index 84c3697ffb4..45218b741f0 100644
--- a/backend/utils/retrieval/agentic.py
+++ b/backend/utils/retrieval/agentic.py
@@ -44,6 +44,7 @@
     get_screen_activity_tool,
     search_screen_activity_tool,
     save_user_preference_tool,
+    fetch_url_tool,
 )
 from utils.retrieval.tools.app_tools import load_app_tools, get_tool_status_message
 from utils.retrieval.safety import AgentSafetyGuard, SafetyGuardError
@@ -96,6 +97,7 @@ def decorator(func):
     get_screen_activity_tool,
     search_screen_activity_tool,
     save_user_preference_tool,
+    fetch_url_tool,
 ]
 
 # Standard tool names (used to detect app tools by exclusion)
@@ -133,6 +135,7 @@ def get_tool_display_name(tool_name: str, tool_obj: Optional[Any] = None) -> str
         'get_screen_activity_tool': 'Checking screen activity',
         'search_screen_activity_tool': 'Searching screen activity',
         'save_user_preference_tool': 'Saving preference',
+        'fetch_url_tool': 'Reading page',
     }
 
     if tool_name in tool_display_map:

From 34a8ef963f44ad0cf9cbd0faf614cf174da093a6 Mon Sep 17 00:00:00 2001
From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com>
Date: Sat, 16 May 2026 20:41:00 +0530
Subject: [PATCH 4/8] feat add get_web_fetch_client isolated from webhook pool
 for URL fetches

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/utils/http_client.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/backend/utils/http_client.py b/backend/utils/http_client.py
index 223d7dfdacf..e5ca7fd1022 100644
--- a/backend/utils/http_client.py
+++ b/backend/utils/http_client.py
@@ -230,6 +230,7 @@ def get_tts_semaphore() -> asyncio.Semaphore:
 _auth_client: httpx.AsyncClient | None = None
 _stt_client: httpx.AsyncClient | None = None
 _tts_client: httpx.AsyncClient | None = None
+_web_fetch_client: httpx.AsyncClient | None = None
 
 
 def get_webhook_client() -> httpx.AsyncClient:
@@ -299,10 +300,25 @@ def get_tts_client() -> httpx.AsyncClient:
     return _tts_client
 
 
+def get_web_fetch_client() -> httpx.AsyncClient:
+    """Return a shared async HTTP client for user-initiated URL fetches.
+
+    Isolated from the webhook pool so slow/stalled external pages don't
+    compete with partner webhook delivery slots.
+    """
+    global _web_fetch_client
+    if _web_fetch_client is None:  # created lazily on first call, then reused
+        _web_fetch_client = httpx.AsyncClient(
+            timeout=httpx.Timeout(15.0, connect=5.0),  # 15 s default for each phase, 5 s to connect
+            limits=httpx.Limits(max_connections=16, max_keepalive_connections=4),
+        )
+    return _web_fetch_client
+
+
 async def close_all_clients():
     """Close all shared HTTP clients. Call at app shutdown."""
-    global _webhook_client, _maps_client, _auth_client, _stt_client, _tts_client
-    for client in (_webhook_client, _maps_client, _auth_client, _stt_client, _tts_client):
+    global _webhook_client, _maps_client, _auth_client, _stt_client, _tts_client, _web_fetch_client
+    for client in (_webhook_client, _maps_client, _auth_client, _stt_client, _tts_client, _web_fetch_client):
         if client is not None:
             try:
                 await client.aclose()
@@ -313,6 +329,7 @@ async def close_all_clients():
     _auth_client = None
     _tts_client = None
     _stt_client = None
+    _web_fetch_client = None
     # Reset stateful registries
     _semaphores.clear()
     _webhook_circuit_breakers.clear()

From 8773b8394a4148ebb45c4793609aa831914c5e0c Mon Sep 17 00:00:00 2001
From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com>
Date: Sat, 16 May 2026 20:41:08 +0530
Subject: [PATCH 5/8] fix fetch_url_tool SSRF guard, stream body cap, sanitize
 URL logs, fix content-type check

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/utils/retrieval/tools/web_tools.py | 155 +++++++++++++++++----
 1 file changed, 127 insertions(+), 28 deletions(-)

diff --git a/backend/utils/retrieval/tools/web_tools.py b/backend/utils/retrieval/tools/web_tools.py
index d04a9f8d2f0..1bd56c0947c 100644
--- a/backend/utils/retrieval/tools/web_tools.py
+++ b/backend/utils/retrieval/tools/web_tools.py
@@ -2,20 +2,60 @@
 Tools for fetching content from specific URLs.
 """
 
+import asyncio
+import ipaddress
 import re
 import logging
 from html.parser import HTMLParser
+from urllib.parse import urlparse
+
 from langchain_core.tools import tool
-from utils.http_client import get_webhook_client
+
+from utils.http_client import get_web_fetch_client
 from utils.log_sanitizer import sanitize
 
 logger = logging.getLogger(__name__)
 
 _SKIP_TAGS = {'script', 'style', 'noscript', 'head', 'meta', 'link', 'svg', 'iframe', 'nav', 'footer'}
+_BLOCK_TAGS = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br', 'tr', 'blockquote', 'section', 'article'}
 _MAX_CONTENT_CHARS = 8000
+_MAX_BODY_BYTES = 512 * 1024  # cap before HTML parsing
+_MAX_REDIRECTS = 5
+
+# RFC-1918, loopback, link-local (incl. cloud metadata), carrier-grade NAT, IPv6 private
+_PRIVATE_NETWORKS = [
+    ipaddress.ip_network('127.0.0.0/8'),
+    ipaddress.ip_network('10.0.0.0/8'),
+    ipaddress.ip_network('172.16.0.0/12'),
+    ipaddress.ip_network('192.168.0.0/16'),
+    ipaddress.ip_network('169.254.0.0/16'),
+    ipaddress.ip_network('100.64.0.0/10'),
+    ipaddress.ip_network('::1/128'),
+    ipaddress.ip_network('fe80::/10'),
+    ipaddress.ip_network('fc00::/7'),
+]
+
+_PARSEABLE_TYPES = ('text/html', 'text/plain', 'application/xhtml+xml', 'application/xml')
+
+
+def _is_private_ip(ip_str: str) -> bool:
+    try:
+        ip = ipaddress.ip_address(ip_str)
+        return not ip.is_global or any(ip in net for net in _PRIVATE_NETWORKS)  # also blocks 0.0.0.0, ::ffff:*
+    except ValueError:
+        return True  # unparseable → treat as blocked
 
 
-_BLOCK_TAGS = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br', 'tr', 'blockquote', 'section', 'article'}
+async def _hostname_is_public(hostname: str) -> bool:
+    """Resolve hostname and return True only if every IP is a public address."""
+    try:
+        loop = asyncio.get_running_loop()
+        results = await loop.getaddrinfo(hostname, None)  # async DNS lookup; one 5-tuple per address
+        if not results:
+            return False
+        return not any(_is_private_ip(r[4][0]) for r in results)  # r[4] is the sockaddr, r[4][0] its IP
+    except Exception:
+        return False  # fail closed: any resolution error counts as "not public"
 
 
 class _TextExtractor(HTMLParser):
@@ -53,6 +93,62 @@ def _html_to_text(html: str) -> str:
     return text.strip()
 
 
+async def _fetch_page(url: str, headers: dict) -> tuple[int, str, str]:
+    """
+    Fetch *url* with SSRF guard, manual redirect following, and a body-size cap.
+    Returns (status_code, content_type, body_text).
+    Raises ValueError on SSRF/redirect violations.
+    """
+    client = get_web_fetch_client()
+
+    for _ in range(_MAX_REDIRECTS + 1):  # the initial request plus at most _MAX_REDIRECTS hops
+        if not url.startswith(('http://', 'https://')):
+            raise ValueError('Redirect target must use http:// or https://')
+
+        parsed = urlparse(url)
+        hostname = parsed.hostname or ''
+        if not hostname:
+            raise ValueError('Invalid URL: no hostname')
+
+        if not await _hostname_is_public(hostname):  # re-checked on every hop, redirects included
+            raise ValueError('URL resolves to a private or reserved address')
+
+        redirect_url = None
+        status = 0
+        content_type = ''
+        body_text = ''
+
+        async with client.stream('GET', url, headers=headers, follow_redirects=False) as response:
+            status = response.status_code
+            content_type = response.headers.get('content-type', '')
+
+            if status in (301, 302, 303, 307, 308):
+                location = response.headers.get('location', '')
+                redirect_url = location
+            else:
+                cl_header = response.headers.get('content-length')
+                if cl_header and int(cl_header) > _MAX_BODY_BYTES:
+                    return status, content_type, ''  # declared size exceeds the cap: body is never read
+
+                chunks = []
+                total = 0
+                async for chunk in response.aiter_bytes(chunk_size=8192):
+                    total += len(chunk)
+                    chunks.append(chunk)
+                    if total >= _MAX_BODY_BYTES:
+                        break  # stop streaming at the cap; the body is truncated, not rejected
+
+                body_text = b''.join(chunks).decode('utf-8', errors='replace')  # always decoded as UTF-8
+
+        if redirect_url is not None:
+            url = redirect_url  # loop again so the new target passes the same scheme/host checks
+            continue
+
+        return status, content_type, body_text
+
+    raise ValueError('Too many redirects')  # loop ended without reaching a non-redirect response
+
+
 @tool
 async def fetch_url_tool(url: str) -> str:
     """
@@ -71,39 +167,42 @@ async def fetch_url_tool(url: str) -> str:
     Returns:
         The readable text content of the page (up to 8000 characters)
     """
-    logger.info(f"fetch_url_tool called - url: {url}")
+    logger.info(f"fetch_url_tool called - url: {sanitize(url)}")
 
     if not url.startswith(('http://', 'https://')):
-        return "Error: URL must start with http:// or https://"
+        return 'Error: URL must start with http:// or https://'
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (compatible; Omi-AI-Bot/1.0)',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,text/plain;q=0.8,*/*;q=0.7',
+        'Accept-Language': 'en-US,en;q=0.5',
+    }
 
     try:
-        client = get_webhook_client()
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (compatible; Omi-AI-Bot/1.0)',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-        }
-        response = await client.get(url, headers=headers, timeout=15.0, follow_redirects=True)
+        status, content_type, body = await _fetch_page(url, headers)
+    except ValueError as e:
+        logger.warning(f"fetch_url_tool blocked - {sanitize(str(e))}")
+        return f'Error: {sanitize(str(e))}'
+    except Exception as e:
+        logger.error(f"fetch_url_tool - error fetching {sanitize(url)}: {sanitize(str(e))}")
+        return f'Error: Failed to fetch the URL. {sanitize(str(e))}'
 
-        if response.status_code != 200:
-            logger.warning(f"fetch_url_tool - HTTP {response.status_code} for {url}")
-            return f"Error: Could not fetch page (HTTP {response.status_code})"
+    if status != 200:
+        logger.warning(f"fetch_url_tool - HTTP {status} for {sanitize(url)}")
+        return f'Error: Could not fetch page (HTTP {status})'
 
-        content_type = response.headers.get('content-type', '')
-        if 'text/html' in content_type or 'text/plain' in content_type or not content_type:
-            text = _html_to_text(response.text)
-        else:
-            return f"Error: Unsupported content type '{content_type}'. Only HTML and plain text pages can be read."
+    if not any(t in content_type for t in _PARSEABLE_TYPES) and content_type:
+        return f"Error: Unsupported content type '{content_type}'. Only HTML and plain text pages can be read."
 
-        if not text:
-            return "Error: Page appears to be empty or has no readable text content."
+    if not body:
+        return 'Error: Page appears to be empty or too large to read.'
 
-        if len(text) > _MAX_CONTENT_CHARS:
-            text = text[:_MAX_CONTENT_CHARS] + f'\n\n[Content truncated — {len(text)} total characters]'
+    text = _html_to_text(body)
+    if not text:
+        return 'Error: Page has no readable text content.'
 
-        logger.info(f"fetch_url_tool - fetched {len(text)} chars from {url}")
-        return f"Content from {url}:\n\n{text}"
+    if len(text) > _MAX_CONTENT_CHARS:
+        text = text[:_MAX_CONTENT_CHARS] + f'\n\n[Content truncated — {len(text)} total characters]'
 
-    except Exception as e:
-        logger.error(f"fetch_url_tool - error fetching {url}: {sanitize(str(e))}")
-        return f"Error: Failed to fetch the URL. {sanitize(str(e))}"
+    logger.info(f"fetch_url_tool - fetched {len(text)} chars from {sanitize(url)}")
+    return f'Content from {url}:\n\n{text}'

From a56aa4295c892f9cdfcbb6a18aefed954510a680 Mon Sep 17 00:00:00 2001
From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com>
Date: Wed, 27 May 2026 22:37:32 +0530
Subject: [PATCH 6/8] fix instruct Claude to always call fetch_url_tool when
 user shares a URL

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/utils/retrieval/agentic.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/backend/utils/retrieval/agentic.py b/backend/utils/retrieval/agentic.py
index 45218b741f0..00a31df2432 100644
--- a/backend/utils/retrieval/agentic.py
+++ b/backend/utils/retrieval/agentic.py
@@ -564,6 +564,14 @@ async def execute_agentic_chat_stream(
 IMPORTANT: Always search for and use these tools when relevant. Never tell the user you don't have access to an integration if a matching tool exists above.
 </available_app_tools>"""
 
+    # Instruct Claude to use fetch_url_tool for any direct URL in the conversation.
+    # Without this, Claude's built-in "I can't browse links" behavior takes over.
+    system_prompt += """
+
+<url_fetching_instructions>
+You have fetch_url_tool available. When the user shares any URL (starting with http:// or https://), you MUST call fetch_url_tool to read its content before responding. Never say you cannot browse, visit, or read a URL. Always attempt to fetch it first.
+</url_fetching_instructions>"""
+
     # Convert tools to Anthropic format (core = visible, app = defer_loading)
     tool_schemas, tool_registry = _convert_tools(core_tools, app_tools)
 

From ae3410965333342597326d9d5ba107d8ca639d65 Mon Sep 17 00:00:00 2001
From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com>
Date: Wed, 27 May 2026 22:44:01 +0530
Subject: [PATCH 7/8] fix extract meta/OG tags and JSON-LD structured data
 from fetched pages

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/utils/retrieval/tools/web_tools.py | 108 ++++++++++++++++++++-
 1 file changed, 104 insertions(+), 4 deletions(-)

diff --git a/backend/utils/retrieval/tools/web_tools.py b/backend/utils/retrieval/tools/web_tools.py
index 1bd56c0947c..d30b6d6a564 100644
--- a/backend/utils/retrieval/tools/web_tools.py
+++ b/backend/utils/retrieval/tools/web_tools.py
@@ -4,6 +4,7 @@
 
 import asyncio
 import ipaddress
+import json
 import re
 import logging
 from html.parser import HTMLParser
@@ -37,6 +38,99 @@
 
 _PARSEABLE_TYPES = ('text/html', 'text/plain', 'application/xhtml+xml', 'application/xml')
 
+# Fields to surface from JSON-LD structured data (schema.org), in display order.
+_JSON_LD_FIELDS = [
+    ('name', 'Title'),
+    ('headline', 'Headline'),
+    ('uploadDate', 'Upload date'),
+    ('datePublished', 'Published'),
+    ('dateModified', 'Modified'),
+    ('author', 'Author'),
+    ('description', 'Description'),
+    ('duration', 'Duration'),
+]
+
+
+def _extract_meta_tags(html: str) -> str:
+    """
+    Extract the page title, meta description, and Open Graph tags from raw HTML.
+    These live inside <head> (which the HTML stripper skips) and are set even on JS-rendered
+    pages. Returns 'Label: value' lines joined by '\n' ('' if none); first value per label wins.
+    """
+    lines = []
+    seen: set = set()
+
+    def add(label: str, value: str) -> None:
+        value = value.strip()
+        if value and label not in seen:
+            seen.add(label)
+            lines.append(f'{label}: {value}')
+
+    title_m = re.search(r'<title[^>]*>(.*?)</title>', html, re.DOTALL | re.IGNORECASE)
+    if title_m:
+        add('Title', re.sub(r'<[^>]+>', '', title_m.group(1)))
+
+    for m in re.finditer(r'<meta\s+([^>]+?)/?>', html, re.IGNORECASE):
+        attrs = m.group(1)
+        name_m = re.search(r'(?:name|property)=["\']([^"\']+)["\']', attrs, re.IGNORECASE)
+        content_m = re.search(r'content=(["\'])(.*?)\1', attrs, re.IGNORECASE | re.DOTALL)  # end at same quote
+        if not name_m or not content_m:
+            continue
+        name = name_m.group(1).lower().strip()
+        content = content_m.group(2).strip()  # group 1 is the quote char, so "Don't" is no longer cut short
+        if not content:
+            continue
+        if name == 'description':
+            add('Description', content)
+        elif name == 'og:title':
+            add('Title', content)
+        elif name == 'og:description':
+            add('Description', content)
+        elif name == 'og:site_name':
+            add('Site', content)
+        elif name == 'og:type':
+            add('Type', content)
+
+    return '\n'.join(lines)
+
+
+def _extract_json_ld(html: str) -> str:
+    """
+    Pull text from <script type="application/ld+json"> blocks.
+    Many JS-rendered pages (YouTube, articles) embed their canonical metadata
+    here even when the visible DOM is empty without JS execution.
+    Returns a formatted multi-line string, or '' if nothing useful is found.
+    """
+    pattern = re.compile(
+        r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.DOTALL | re.IGNORECASE
+    )
+    lines = []
+    for match in pattern.finditer(html):
+        try:
+            data = json.loads(match.group(1))
+        except (json.JSONDecodeError, ValueError):
+            continue  # ignore blocks whose payload is not valid JSON
+
+        if isinstance(data, list):  # a block may hold one object or a list of objects
+            items = data
+        else:
+            items = [data]
+
+        for item in items:
+            if not isinstance(item, dict):
+                continue
+            for key, label in _JSON_LD_FIELDS:
+                val = item.get(key)
+                if not val:
+                    continue
+                if isinstance(val, dict):
+                    val = val.get('name') or val.get('@id') or str(val)  # nested object: name, then @id
+                elif isinstance(val, list):  # lists are capped at their first three entries
+                    val = ', '.join(str(v.get('name', v) if isinstance(v, dict) else v) for v in val[:3])
+                lines.append(f'{label}: {val}')
+
+    return '\n'.join(lines)
+
 
 def _is_private_ip(ip_str: str) -> bool:
     try:
@@ -82,15 +176,21 @@ def handle_data(self, data):
 
 
 def _html_to_text(html: str) -> str:
+    meta = _extract_meta_tags(html)
+    structured = _extract_json_ld(html)
+
     parser = _TextExtractor()
     try:
         parser.feed(html)
     except Exception:
         pass
-    text = ' '.join(parser.chunks)
-    text = re.sub(r' \n ', '\n', text)
-    text = re.sub(r'\n{3,}', '\n\n', text)
-    return text.strip()
+    body = ' '.join(parser.chunks)
+    body = re.sub(r' \n ', '\n', body)
+    body = re.sub(r'\n{3,}', '\n\n', body)
+    body = body.strip()
+
+    parts = [p for p in (meta, structured, body) if p]
+    return '\n\n'.join(parts)
 
 
 async def _fetch_page(url: str, headers: dict) -> tuple[int, str, str]:

From 992111ba58c3bdee1bc982f861f9ad8cb4d200c9 Mon Sep 17 00:00:00 2001
From: Krushna Kanta Rout <129386740+krushnarout@users.noreply.github.com>
Date: Thu, 28 May 2026 07:31:12 +0530
Subject: [PATCH 8/8] fix resolve relative redirect URLs with urljoin to avoid
 ValueError on same-origin redirects

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/utils/retrieval/tools/web_tools.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/utils/retrieval/tools/web_tools.py b/backend/utils/retrieval/tools/web_tools.py
index d30b6d6a564..507c74b633d 100644
--- a/backend/utils/retrieval/tools/web_tools.py
+++ b/backend/utils/retrieval/tools/web_tools.py
@@ -8,7 +8,7 @@
 import re
 import logging
 from html.parser import HTMLParser
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urljoin
 
 from langchain_core.tools import tool
 
@@ -224,7 +224,7 @@ async def _fetch_page(url: str, headers: dict) -> tuple[int, str, str]:
 
             if status in (301, 302, 303, 307, 308):
                 location = response.headers.get('location', '')
-                redirect_url = location
+                redirect_url = urljoin(url, location)
             else:
                 cl_header = response.headers.get('content-length')
                 if cl_header and int(cl_header) > _MAX_BODY_BYTES: