diff --git a/README.md b/README.md index 01cc552..43d1491 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ With mini-browser: clean text → 800 tokens → $ **mini-browser** sits between your AI agent and the web: -1. **Searches** — DuckDuckGo, no API key required, spam filtered, recency-aware +1. **Searches** — DuckDuckGo (default, no API key) or Tavily (optional, higher quality), spam filtered, recency-aware 2. **Fetches** — full Playwright browser for JS-heavy sites (Yahoo Finance, TradingView, Bloomberg), fast httpx for simple sites 3. **Extracts** — trafilatura + BeautifulSoup strip all noise (nav/ads/footer) 4. **Compresses** — sentence-level relevance scoring keeps only what matters for your query @@ -94,6 +94,11 @@ python -m playwright install chromium pip install "mini-browser[mcp] @ git+https://github.com/ghanibot/mini-browser.git" ``` +### With Tavily search +```bash +pip install "mini-browser[tavily] @ git+https://github.com/ghanibot/mini-browser.git" +``` + ### With PDF support ```bash pip install "mini-browser[pdf] @ git+https://github.com/ghanibot/mini-browser.git" @@ -105,7 +110,7 @@ pip install "mini-browser[full] @ git+https://github.com/ghanibot/mini-browser.g python -m playwright install chromium ``` -**Requirements:** Python 3.10+, internet connection, no API keys needed. +**Requirements:** Python 3.10+, internet connection. No API keys needed for default DuckDuckGo search; Tavily requires a `TAVILY_API_KEY` (see [Configuration](#search-provider) below). --- @@ -238,7 +243,7 @@ result = handle_tool_call(tool_name, tool_arguments) | **Recency detection** | Queries containing "terbaru/latest/hari ini" auto-apply DuckDuckGo time filter | | **Retry with backoff** | Failed fetches retry 2x with exponential backoff | | **Configurable domains** | Add custom JS-heavy domains via env var or `.mini-browser.json` | -| **No API keys** | Uses DuckDuckGo — completely free | +| **No API keys** | Uses DuckDuckGo by default — completely free. 
def get_search_provider() -> str:
    """Return the configured search provider name, normalised for comparison.

    The name is read from the ``MINI_BROWSER_SEARCH_PROVIDER`` environment
    variable on every call, stripped of surrounding whitespace and
    lower-cased. When the variable is unset or blank the default
    ``"duckduckgo"`` is returned.

    The value is not validated against the set of known providers: any other
    non-blank string is returned as-is (normalised), and it is up to the
    caller to decide how to treat an unrecognised name.
    """
    raw = os.environ.get("MINI_BROWSER_SEARCH_PROVIDER", "")
    # Strip before lower-casing so a value with stray whitespace (" tavily ")
    # still selects the intended provider, and fall back to the default for a
    # blank value rather than returning an empty provider name.
    return raw.strip().lower() or "duckduckgo"
# Translates the single-letter ``timelimit`` codes ("d", "w", "m", "y") into
# the ``time_range`` keyword values sent to Tavily.
_TIMELIMIT_TO_TIME_RANGE = {
    "d": "day",
    "w": "week",
    "m": "month",
    "y": "year",
}


def _search_tavily(
    query: str,
    max_results: int = 5,
    timelimit: str | None = None,
) -> list[dict]:
    """Search via Tavily and normalise results to ``{href, title, body}``.

    Args:
        query: Search query forwarded to ``TavilyClient.search``.
        max_results: Number of results requested from Tavily.
        timelimit: Optional recency code ("d", "w", "m" or "y"). A code that
            is not a key of ``_TIMELIMIT_TO_TIME_RANGE`` is ignored and the
            search runs without a time filter.

    Returns:
        One dict per Tavily result with the keys ``href`` (from ``url``),
        ``title`` and ``body`` (from ``content``). Missing or null fields
        become empty strings. If the search fails for any reason the failure
        is logged as a warning and an empty list is returned.

    Raises:
        ImportError: If the optional ``tavily`` package cannot be imported.
    """
    try:
        from tavily import TavilyClient
    except ImportError as exc:
        # Chain explicitly so the traceback shows the underlying import
        # failure as the direct cause of this more helpful message.
        raise ImportError(
            "tavily-python is required for the Tavily search provider. "
            "Install it with: pip install mini-browser[tavily]"
        ) from exc

    try:
        # The client is built without arguments; per the project README the
        # API key is expected in TAVILY_API_KEY. A missing key therefore
        # surfaces here and is handled like any other search failure.
        client = TavilyClient()
        kwargs: dict = {"max_results": max_results}
        if timelimit and (time_range := _TIMELIMIT_TO_TIME_RANGE.get(timelimit)):
            kwargs["time_range"] = time_range
        response = client.search(query, **kwargs)
        # ``or`` (rather than a ``.get`` default) also covers keys that are
        # present but null, so every field is always a string.
        return [
            {
                "href": item.get("url") or "",
                "title": item.get("title") or "",
                "body": item.get("content") or "",
            }
            for item in response.get("results") or []
        ]
    except Exception:
        # Deliberate best-effort boundary: a failed search must not crash the
        # caller, but it is logged with the traceback for diagnosis.
        _log.warning("Tavily search failed for query %r", query, exc_info=True)
        return []