diff --git a/examples/async_crawl_example.py b/examples/async_crawl_example.py
new file mode 100644
index 0000000..0b3b2b0
--- /dev/null
+++ b/examples/async_crawl_example.py
@@ -0,0 +1,33 @@
+"""
+Async crawl example.
+"""
+
+import asyncio
+import json
+
+from scrapegraph_py import AsyncClient
+
+
+async def main():
+    async with AsyncClient() as client:
+        # Start crawl
+        job = await client.crawl.start(
+            "https://example.com",
+            depth=2,
+            max_pages=5,
+        )
+        print("Crawl started:", json.dumps(job, indent=2))
+
+        # Poll for completion (bounded, so a stuck job cannot hang the script)
+        crawl_id = job["id"]
+        for _ in range(150):  # 150 polls x 2 s sleep = roughly 5 minutes at most
+            status = await client.crawl.status(crawl_id)
+            print(f"Status: {status.get('status')}")
+            if status.get("status") in ("completed", "failed"):
+                break
+            await asyncio.sleep(2)
+
+        print("\nResult:", json.dumps(status, indent=2))  # last status seen, finished or not
+
+
+asyncio.run(main())
diff --git a/examples/async_credits_example.py b/examples/async_credits_example.py
new file mode 100644
index 0000000..40ee7cf
--- /dev/null
+++ b/examples/async_credits_example.py
@@ -0,0 +1,17 @@
+"""
+Async credits check.
+"""
+
+import asyncio
+import json
+
+from scrapegraph_py import AsyncClient
+
+
+async def main():
+    async with AsyncClient() as client:
+        credits = await client.credits()
+        print(json.dumps(credits, indent=2))
+
+
+asyncio.run(main())
diff --git a/examples/async_extract_example.py b/examples/async_extract_example.py
new file mode 100644
index 0000000..b7442fa
--- /dev/null
+++ b/examples/async_extract_example.py
@@ -0,0 +1,40 @@
+"""
+Async extract example - extract data from multiple pages concurrently.
+"""
+
+import asyncio
+import json
+
+from pydantic import BaseModel, Field
+
+from scrapegraph_py import AsyncClient
+
+
+class PageInfo(BaseModel):  # passed to client.extract() as output_schema below
+    title: str = Field(description="Page title")
+    description: str = Field(description="Brief description of the page content")
+
+
+async def main():
+    async with AsyncClient() as client:
+        urls = [
+            "https://example.com",
+            "https://httpbin.org/html",
+        ]
+
+        tasks = [  # one client.extract() call per URL, awaited together below
+            client.extract(
+                url=url,
+                prompt="Extract the page title and a brief description",
+                output_schema=PageInfo,
+            )
+            for url in urls
+        ]
+        results = await asyncio.gather(*tasks)  # results come back in the order of tasks
+
+        for url, result in zip(urls, results):  # pair each URL with its own result
+            print(f"\n=== {url} ===")
+            print(json.dumps(result, indent=2))
+
+
+asyncio.run(main())
diff --git a/examples/async_monitor_example.py b/examples/async_monitor_example.py
new file mode 100644
index 0000000..4f913cc
--- /dev/null
+++ b/examples/async_monitor_example.py
@@ -0,0 +1,27 @@
+"""
+Async monitor example.
+"""
+
+import asyncio
+import json
+
+from scrapegraph_py import AsyncClient
+
+
+async def main():
+    async with AsyncClient() as client:
+        # Create a monitor
+        monitor = await client.monitor.create(
+            name="Async Price Tracker",
+            url="https://example.com/products",
+            prompt="Extract product prices",
+            cron="0 12 * * *",  # Every day at noon
+        )
+        print("Created:", json.dumps(monitor, indent=2))
+
+        # List all monitors
+        all_monitors = await client.monitor.list()
+        print("\nAll monitors:", json.dumps(all_monitors, indent=2))
+
+
+asyncio.run(main())
diff --git a/examples/async_scrape_example.py b/examples/async_scrape_example.py
new file mode 100644
index 0000000..63a9ddd
--- /dev/null
+++ b/examples/async_scrape_example.py
@@ -0,0 +1,27 @@
+"""
+Async scrape example - scrape multiple pages concurrently.
+"""
+
+import asyncio
+import json
+
+from scrapegraph_py import AsyncClient
+
+
+async def main():
+    async with AsyncClient() as client:
+        # Scrape multiple pages concurrently
+        urls = [
+            "https://example.com",
+            "https://httpbin.org/html",
+        ]
+
+        tasks = [client.scrape(url) for url in urls]  # one scrape call per URL
+        results = await asyncio.gather(*tasks)  # results come back in the order of tasks
+
+        for url, result in zip(urls, results):  # pair each URL with its own result
+            print(f"\n=== {url} ===")
+            print(json.dumps(result, indent=2))
+
+
+asyncio.run(main())
diff --git a/examples/async_search_example.py b/examples/async_search_example.py
new file mode 100644
index 0000000..3cdcc1a
--- /dev/null
+++ b/examples/async_search_example.py
@@ -0,0 +1,26 @@
+"""
+Async search example - run multiple searches concurrently.
+"""
+
+import asyncio
+import json
+
+from scrapegraph_py import AsyncClient
+
+
+async def main():
+    async with AsyncClient() as client:
+        queries = [
+            "best python frameworks 2025",
+            "top javascript libraries 2025",
+        ]
+
+        tasks = [client.search(q, num_results=3) for q in queries]  # one search call per query
+        results = await asyncio.gather(*tasks)  # results come back in the order of tasks
+
+        for query, result in zip(queries, results):  # pair each query with its own result
+            print(f"\n=== {query} ===")
+            print(json.dumps(result, indent=2))
+
+
+asyncio.run(main())
diff --git a/examples/crawl_basic_example.py b/examples/crawl_basic_example.py
new file mode 100644
index 0000000..1b18135
--- /dev/null
+++ b/examples/crawl_basic_example.py
@@ -0,0 +1,35 @@
+"""
+Crawl a website and get pages as markdown.
+
+The crawl endpoint discovers and fetches multiple pages from a website,
+starting from a given URL and following links up to a specified depth.
+"""
+
+import json
+import time
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    # Start the crawl
+    job = client.crawl.start(
+        "https://example.com",
+        depth=2,
+        max_pages=5,
+        format="markdown",
+    )
+    print("Crawl started:", json.dumps(job, indent=2))
+
+    # Poll for status (bounded, so a stuck job cannot hang the script)
+    crawl_id = job["id"]
+    for _ in range(150):  # 150 polls x 2 s sleep = roughly 5 minutes at most
+        status = client.crawl.status(crawl_id)
+        print(f"Status: {status.get('status')}")
+        if status.get("status") in ("completed", "failed"):
+            break
+        time.sleep(2)
+
+    print("\nFinal result:", json.dumps(status, indent=2))  # last status seen, finished or not
+finally:
+    client.close()  # always release the client, even if a request above raised
diff --git a/examples/crawl_stop_resume_example.py b/examples/crawl_stop_resume_example.py
new file mode 100644
index 0000000..92a2cd5
--- /dev/null
+++ b/examples/crawl_stop_resume_example.py
@@ -0,0 +1,26 @@
+"""
+Stop and resume a crawl job.
+
+You can stop a running crawl and resume it later.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    # Start a crawl
+    job = client.crawl.start("https://example.com", depth=3, max_pages=50)
+    crawl_id = job["id"]
+    print("Crawl started:", crawl_id)
+
+    # Stop the crawl
+    stopped = client.crawl.stop(crawl_id)
+    print("Stopped:", json.dumps(stopped, indent=2))
+
+    # Resume the crawl later
+    resumed = client.crawl.resume(crawl_id)
+    print("Resumed:", json.dumps(resumed, indent=2))
+finally:
+    client.close()  # always release the client, even if a request above raised
diff --git a/examples/crawl_with_fetch_config_example.py b/examples/crawl_with_fetch_config_example.py
new file mode 100644
index 0000000..ca6a3ea
--- /dev/null
+++ b/examples/crawl_with_fetch_config_example.py
@@ -0,0 +1,27 @@
+"""
+Crawl with custom fetch configuration.
+
+Use FetchConfig to enable stealth mode, JS rendering, etc. for all
+pages during the crawl.
+"""
+
+import json
+
+from scrapegraph_py import Client, FetchConfig
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    job = client.crawl.start(
+        "https://example.com",
+        depth=2,
+        max_pages=10,
+        format="html",
+        fetch_config=FetchConfig(
+            stealth=True,
+            render_js=True,
+            wait_ms=1000,
+        ),
+    )
+    print("Crawl started:", json.dumps(job, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/crawl_with_patterns_example.py b/examples/crawl_with_patterns_example.py
new file mode 100644
index 0000000..21091da
--- /dev/null
+++ b/examples/crawl_with_patterns_example.py
@@ -0,0 +1,24 @@
+"""
+Crawl a website with URL pattern filtering.
+
+Use include_patterns and exclude_patterns to control which pages
+the crawler visits. Patterns support * (any chars) and ** (any path segments).
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    job = client.crawl.start(
+        "https://example.com",
+        depth=3,
+        max_pages=20,
+        format="markdown",
+        include_patterns=["/blog/*", "/docs/**"],
+        exclude_patterns=["/admin/*", "/api/*"],
+    )
+    print("Crawl started:", json.dumps(job, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/credits_example.py b/examples/credits_example.py
new file mode 100644
index 0000000..2bc8026
--- /dev/null
+++ b/examples/credits_example.py
@@ -0,0 +1,14 @@
+"""
+Check your remaining API credits.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    credits = client.credits()
+    print(json.dumps(credits, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/extract_basic_example.py b/examples/extract_basic_example.py
new file mode 100644
index 0000000..177613b
--- /dev/null
+++ b/examples/extract_basic_example.py
@@ -0,0 +1,20 @@
+"""
+Extract structured data from a webpage using a natural language prompt.
+
+The extract endpoint uses AI to understand your prompt and pull out
+exactly the data you need.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    result = client.extract(
+        url="https://example.com",
+        prompt="Extract the page title and main description",
+    )
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/extract_with_fetch_config_example.py b/examples/extract_with_fetch_config_example.py
new file mode 100644
index 0000000..112b891
--- /dev/null
+++ b/examples/extract_with_fetch_config_example.py
@@ -0,0 +1,26 @@
+"""
+Extract data from a JavaScript-heavy page using FetchConfig.
+
+Use FetchConfig to enable stealth mode, JS rendering, scrolling,
+and other options needed for dynamic pages.
+"""
+
+import json
+
+from scrapegraph_py import Client, FetchConfig
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    result = client.extract(
+        url="https://example.com",
+        prompt="Extract all visible text content",
+        fetch_config=FetchConfig(
+            stealth=True,
+            render_js=True,
+            wait_ms=2000,
+            scrolls=3,
+        ),
+    )
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/extract_with_json_schema_example.py b/examples/extract_with_json_schema_example.py
new file mode 100644
index 0000000..e40de5c
--- /dev/null
+++ b/examples/extract_with_json_schema_example.py
@@ -0,0 +1,40 @@
+"""
+Extract structured data using a raw JSON Schema dict.
+
+You can pass a JSON Schema dictionary directly if you prefer not to
+use Pydantic models.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+schema = {
+    "type": "object",
+    "properties": {
+        "title": {"type": "string", "description": "Page title"},
+        "links": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "text": {"type": "string"},
+                    "href": {"type": "string"},
+                },
+            },
+            "description": "All links on the page",
+        },
+    },
+    "required": ["title", "links"],
+}
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    result = client.extract(
+        url="https://example.com",
+        prompt="Extract the page title and all links",
+        output_schema=schema,
+    )
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/extract_with_llm_config_example.py b/examples/extract_with_llm_config_example.py
new file mode 100644
index 0000000..f670c5c
--- /dev/null
+++ b/examples/extract_with_llm_config_example.py
@@ -0,0 +1,23 @@
+"""
+Extract data with custom LLM configuration.
+
+Use LlmConfig to control the model, temperature, and other LLM parameters.
+"""
+
+import json
+
+from scrapegraph_py import Client, LlmConfig
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    result = client.extract(
+        url="https://example.com",
+        prompt="Extract a detailed summary of the page content",
+        llm_config=LlmConfig(
+            temperature=0.3,
+            max_tokens=1000,
+        ),
+    )
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/extract_with_pydantic_schema_example.py b/examples/extract_with_pydantic_schema_example.py
new file mode 100644
index 0000000..2f12e8a
--- /dev/null
+++ b/examples/extract_with_pydantic_schema_example.py
@@ -0,0 +1,37 @@
+"""
+Extract structured data using a Pydantic model as the output schema.
+
+When you pass a Pydantic BaseModel class, the SDK automatically converts
+it to a JSON Schema and sends it to the API. This ensures the response
+matches your expected structure.
+"""
+
+import json
+from typing import List, Optional
+
+from pydantic import BaseModel, Field
+
+from scrapegraph_py import Client
+
+
+class Product(BaseModel):
+    name: str = Field(description="Product name")
+    price: float = Field(description="Product price in USD")
+    description: Optional[str] = Field(default=None, description="Product description")  # may be absent
+    in_stock: bool = Field(description="Whether the product is in stock")
+
+
+class ProductList(BaseModel):
+    products: List[Product] = Field(description="List of products found on the page")
+
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    result = client.extract(
+        url="https://example.com",
+        prompt="Extract all products with their prices and availability",
+        output_schema=ProductList,
+    )
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/history_example.py b/examples/history_example.py
new file mode 100644
index 0000000..ad57f5f
--- /dev/null
+++ b/examples/history_example.py
@@ -0,0 +1,29 @@
+"""
+Retrieve your API request history.
+
+The history endpoint lets you review past requests with optional filters.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    # Get all recent history
+    history = client.history()
+    print("Recent history:", json.dumps(history, indent=2))
+
+    # Filter by endpoint
+    scrape_history = client.history(endpoint="scrape", limit=5)
+    print("\nScrape history:", json.dumps(scrape_history, indent=2))
+
+    # Filter by status
+    completed = client.history(status="completed", limit=10)
+    print("\nCompleted requests:", json.dumps(completed, indent=2))
+
+    # Paginate
+    page2 = client.history(limit=10, offset=10)
+    print("\nPage 2:", json.dumps(page2, indent=2))
+finally:
+    client.close()  # always release the client, even if a request above raised
diff --git a/examples/monitor_create_example.py b/examples/monitor_create_example.py
new file mode 100644
index 0000000..c76931b
--- /dev/null
+++ b/examples/monitor_create_example.py
@@ -0,0 +1,22 @@
+"""
+Create a monitor to track changes on a webpage.
+
+Monitors run on a cron schedule and use AI to extract data each time.
+This replaces the old scheduled jobs API.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    monitor = client.monitor.create(
+        name="Daily Price Tracker",
+        url="https://example.com/products",
+        prompt="Extract all product names and prices",
+        cron="0 9 * * *",  # Every day at 9am
+    )
+    print("Monitor created:", json.dumps(monitor, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/monitor_manage_example.py b/examples/monitor_manage_example.py
new file mode 100644
index 0000000..76094a6
--- /dev/null
+++ b/examples/monitor_manage_example.py
@@ -0,0 +1,36 @@
+"""
+List, pause, resume, and delete monitors.
+
+Shows all lifecycle operations for managing monitors.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    # List all monitors
+    monitors = client.monitor.list()
+    print("All monitors:", json.dumps(monitors, indent=2))
+
+    # If you have a monitor ID, you can manage it:
+    # monitor_id = "your-monitor-id"
+
+    # Get details
+    # details = client.monitor.get(monitor_id)
+    # print("Details:", json.dumps(details, indent=2))
+
+    # Pause a monitor
+    # paused = client.monitor.pause(monitor_id)
+    # print("Paused:", json.dumps(paused, indent=2))
+
+    # Resume a paused monitor
+    # resumed = client.monitor.resume(monitor_id)
+    # print("Resumed:", json.dumps(resumed, indent=2))
+
+    # Delete a monitor
+    # deleted = client.monitor.delete(monitor_id)
+    # print("Deleted:", json.dumps(deleted, indent=2))
+finally:
+    client.close()  # always release the client, even if a request above raised
diff --git a/examples/monitor_with_config_example.py b/examples/monitor_with_config_example.py
new file mode 100644
index 0000000..22c12f1
--- /dev/null
+++ b/examples/monitor_with_config_example.py
@@ -0,0 +1,27 @@
+"""
+Create a monitor with custom fetch and LLM configuration.
+"""
+
+import json
+
+from scrapegraph_py import Client, FetchConfig, LlmConfig
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    monitor = client.monitor.create(
+        name="Stealth News Monitor",
+        url="https://example.com/news",
+        prompt="Extract the top 5 news headlines with their dates",
+        cron="0 */6 * * *",  # Every 6 hours
+        fetch_config=FetchConfig(
+            stealth=True,
+            render_js=True,
+            wait_ms=2000,
+        ),
+        llm_config=LlmConfig(
+            temperature=0.1,
+        ),
+    )
+    print("Monitor created:", json.dumps(monitor, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/monitor_with_schema_example.py b/examples/monitor_with_schema_example.py
new file mode 100644
index 0000000..db95265
--- /dev/null
+++ b/examples/monitor_with_schema_example.py
@@ -0,0 +1,41 @@
+"""
+Create a monitor with a JSON Schema for structured output.
+
+The output_schema ensures each extraction returns data in a
+consistent format.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+schema = {
+    "type": "object",
+    "properties": {
+        "products": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "name": {"type": "string"},
+                    "price": {"type": "number"},
+                    "currency": {"type": "string"},
+                },
+                "required": ["name", "price"],
+            },
+        },
+    },
+}
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    monitor = client.monitor.create(
+        name="Weekly Product Monitor",
+        url="https://example.com/shop",
+        prompt="Extract all product names, prices, and currencies",
+        cron="0 8 * * 1",  # Every Monday at 8am
+        output_schema=schema,
+    )
+    print("Monitor created:", json.dumps(monitor, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/scrape_html_example.py b/examples/scrape_html_example.py
new file mode 100644
index 0000000..d97b716
--- /dev/null
+++ b/examples/scrape_html_example.py
@@ -0,0 +1,14 @@
+"""
+Scrape a webpage and get the raw HTML content.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    result = client.scrape("https://example.com", format="html")
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/scrape_markdown_example.py b/examples/scrape_markdown_example.py
new file mode 100644
index 0000000..78ecbe1
--- /dev/null
+++ b/examples/scrape_markdown_example.py
@@ -0,0 +1,16 @@
+"""
+Scrape a webpage and get the content as clean markdown.
+
+This is the simplest way to get readable content from any URL.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    result = client.scrape("https://example.com")
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/scrape_screenshot_example.py b/examples/scrape_screenshot_example.py
new file mode 100644
index 0000000..71ad252
--- /dev/null
+++ b/examples/scrape_screenshot_example.py
@@ -0,0 +1,14 @@
+"""
+Scrape a webpage and capture a screenshot.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    result = client.scrape("https://example.com", format="screenshot")
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/scrape_with_fetch_config_example.py b/examples/scrape_with_fetch_config_example.py
new file mode 100644
index 0000000..e5d2f1a
--- /dev/null
+++ b/examples/scrape_with_fetch_config_example.py
@@ -0,0 +1,28 @@
+"""
+Scrape a webpage with custom fetch configuration.
+
+FetchConfig allows you to control stealth mode, JavaScript rendering,
+wait times, cookies, headers, country-based geolocation, and more.
+"""
+
+import json
+
+from scrapegraph_py import Client, FetchConfig
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    result = client.scrape(
+        "https://example.com",
+        format="markdown",
+        fetch_config=FetchConfig(
+            stealth=True,
+            render_js=True,
+            wait_ms=3000,
+            headers={"User-Agent": "MyBot/1.0"},
+            cookies={"session": "abc123"},
+            country="us",
+        ),
+    )
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/search_basic_example.py b/examples/search_basic_example.py
new file mode 100644
index 0000000..3aaa74d
--- /dev/null
+++ b/examples/search_basic_example.py
@@ -0,0 +1,17 @@
+"""
+Search the web and get AI-extracted results.
+
+The search endpoint performs a web search and uses AI to extract
+structured data from the results.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    result = client.search("best python web scraping libraries 2025")
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/search_num_results_example.py b/examples/search_num_results_example.py
new file mode 100644
index 0000000..6e2a46c
--- /dev/null
+++ b/examples/search_num_results_example.py
@@ -0,0 +1,21 @@
+"""
+Search with a custom number of results.
+
+num_results controls how many web pages are scraped (3-20).
+More results = more comprehensive but costs more credits.
+"""
+
+import json
+
+from scrapegraph_py import Client
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    # Get more results for deeper research
+    result = client.search(
+        query="machine learning frameworks comparison",
+        num_results=10,
+    )
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/examples/search_with_schema_example.py b/examples/search_with_schema_example.py
new file mode 100644
index 0000000..64969b4
--- /dev/null
+++ b/examples/search_with_schema_example.py
@@ -0,0 +1,35 @@
+"""
+Search the web with a Pydantic output schema.
+
+Combine web search with structured extraction to get exactly
+the data format you need.
+"""
+
+import json
+from typing import List
+
+from pydantic import BaseModel, Field
+
+from scrapegraph_py import Client
+
+
+class SearchResult(BaseModel):
+    title: str = Field(description="Result title")
+    url: str = Field(description="Result URL")
+    summary: str = Field(description="Brief summary of the content")
+
+
+class SearchResults(BaseModel):
+    results: List[SearchResult] = Field(description="List of search results")
+
+
+client = Client()  # uses SGAI_API_KEY env var
+try:
+    result = client.search(
+        query="latest AI news",
+        num_results=5,
+        output_schema=SearchResults,
+    )
+    print(json.dumps(result, indent=2))
+finally:
+    client.close()  # always release the client, even if the request raised
diff --git a/scrapegraph-py/MIGRATION_V2.md b/scrapegraph-py/MIGRATION_V2.md
new file mode 100644
index 0000000..a55398a
--- /dev/null
+++ b/scrapegraph-py/MIGRATION_V2.md
@@ -0,0 +1,649 @@
+# Migration Guide: scrapegraph-py v1 → v2
+
+This guide covers all breaking changes when upgrading from `scrapegraph-py` v1.x to v2.0.
+
+## Installation
+
+```bash
+pip install scrapegraph-py==2.0.0
+```
+
+## Overview of Changes
+
+| Area | v1 | v2 |
+|------|----|----|
+| **Package version** | 1.x | 2.0.0 |
+| **API base URL** | `https://api.scrapegraphai.com/v1` | `https://api.scrapegraphai.com/api/v2` |
+| **Auth header** | `SGAI-APIKEY: sgai-...` | `Authorization: Bearer sgai-...` (+ `SGAI-APIKEY` for backwards compat) |
+| **SDK version header** | None | `X-SDK-Version: python@2.0.0` |
+| **Client init** | `Client(api_key=...)` | `Client(api_key=..., base_url=...)` |
+| **Crawl methods** | `client.crawl(...)` | `client.crawl.start(...)` (namespaced) |
+| **Scheduled jobs** | `client.create_scheduled_job(...)` | `client.monitor.create(...)` (namespaced) |
+
+---
+
+## Client Initialization
+
+The `Client` and `AsyncClient` constructors now accept an optional `base_url` parameter and no longer support `mock`, `mock_handler`, or `mock_responses`.
+ +### v1 + +```python +from scrapegraph_py import Client + +client = Client( + api_key="sgai-...", + verify_ssl=True, + timeout=30, + max_retries=3, + retry_delay=1.0, + mock=False, + mock_handler=None, + mock_responses=None, +) +``` + +### v2 + +```python +from scrapegraph_py import Client + +client = Client( + api_key="sgai-...", + base_url="https://api.scrapegraphai.com/api/v2", # optional override + verify_ssl=True, + timeout=30, + max_retries=3, + retry_delay=1.0, +) +``` + +> **Note:** The `mock`, `mock_handler`, and `mock_responses` parameters have been removed. Use standard mocking libraries (`responses`, `aioresponses`, `unittest.mock`) for testing instead. + +--- + +## Endpoint Migration Reference + +### SmartScraper → `extract()` + +The `smartscraper()` method has been renamed to `extract()`. The parameter names have changed. + +#### v1 + +```python +response = client.smartscraper( + website_url="https://example.com", + user_prompt="Extract the main heading and description", + output_schema=MyPydanticModel, + headers={"User-Agent": "MyBot"}, + cookies={"session": "abc123"}, + number_of_scrolls=3, + render_heavy_js=True, + stealth=True, + wait_ms=2000, +) + +# Get result by ID +result = client.get_smartscraper(request_id) +``` + +#### v2 + +```python +from scrapegraph_py import FetchConfig + +response = client.extract( + url="https://example.com", + prompt="Extract the main heading and description", + output_schema=MyPydanticModel, + fetch_config=FetchConfig( + headers={"User-Agent": "MyBot"}, + cookies={"session": "abc123"}, + scrolls=3, + render_js=True, + stealth=True, + wait_ms=2000, + ), +) +``` + +| v1 parameter | v2 equivalent | +|---|---| +| `website_url` | `url` | +| `user_prompt` | `prompt` | +| `output_schema` | `output_schema` (unchanged) | +| `headers` | `fetch_config=FetchConfig(headers=...)` | +| `cookies` | `fetch_config=FetchConfig(cookies=...)` | +| `number_of_scrolls` | `fetch_config=FetchConfig(scrolls=...)` | +| `render_heavy_js` | 
`fetch_config=FetchConfig(render_js=...)` | +| `stealth` | `fetch_config=FetchConfig(stealth=...)` | +| `wait_ms` | `fetch_config=FetchConfig(wait_ms=...)` | +| `mock` | Removed | +| `plain_text` | Removed | +| `total_pages` | Removed | +| `website_html` | Removed (URL only) | +| `website_markdown` | Removed (URL only) | +| `return_toon` | Removed | + +> **Note:** `get_smartscraper()` has been removed. The `extract()` response is returned directly. + +--- + +### SearchScraper → `search()` + +#### v1 + +```python +response = client.searchscraper( + user_prompt="What is the latest version of Python?", + num_results=5, + output_schema=MyModel, + extraction_mode=True, + stealth=True, + location_geo_code="us", + time_range=TimeRange.PAST_WEEK, +) + +result = client.get_searchscraper(request_id) +``` + +#### v2 + +```python +response = client.search( + query="What is the latest version of Python?", + num_results=5, + output_schema=MyModel, +) +``` + +| v1 parameter | v2 equivalent | +|---|---| +| `user_prompt` | `query` | +| `num_results` | `num_results` (unchanged) | +| `output_schema` | `output_schema` (unchanged) | +| `extraction_mode` | Removed (always AI extraction) | +| `stealth` | Removed | +| `location_geo_code` | Removed | +| `time_range` | Removed | +| `mock` | Removed | +| `return_toon` | Removed | + +> **Note:** `get_searchscraper()` has been removed. + +--- + +### Scrape → `scrape()` + +The `scrape()` method name stays the same but the parameters and request format have changed. v2 uses a format-based approach (markdown, html, screenshot, branding). 
+ +#### v1 + +```python +response = client.scrape( + website_url="https://example.com", + render_heavy_js=True, + branding=True, + headers={"User-Agent": "MyBot"}, + stealth=True, + wait_ms=2000, +) + +result = client.get_scrape(request_id) +``` + +#### v2 + +```python +from scrapegraph_py import FetchConfig + +# Get markdown (default) +response = client.scrape("https://example.com") + +# Get HTML +response = client.scrape("https://example.com", format="html") + +# Get screenshot +response = client.scrape("https://example.com", format="screenshot") + +# With fetch config +response = client.scrape( + "https://example.com", + format="markdown", + fetch_config=FetchConfig( + render_js=True, + stealth=True, + wait_ms=2000, + headers={"User-Agent": "MyBot"}, + ), +) +``` + +| v1 parameter | v2 equivalent | +|---|---| +| `website_url` | `url` (positional) | +| `render_heavy_js` | `fetch_config=FetchConfig(render_js=...)` | +| `branding` | `format="branding"` | +| `headers` | `fetch_config=FetchConfig(headers=...)` | +| `stealth` | `fetch_config=FetchConfig(stealth=...)` | +| `wait_ms` | `fetch_config=FetchConfig(wait_ms=...)` | +| `mock` | Removed | +| `return_toon` | Removed | + +> **Note:** `get_scrape()` has been removed. + +--- + +### Markdownify → `scrape(format="markdown")` + +The `markdownify()` endpoint has been replaced by `scrape()` with `format="markdown"`. + +#### v1 + +```python +response = client.markdownify( + website_url="https://example.com", + render_heavy_js=True, + stealth=True, +) +``` + +#### v2 + +```python +response = client.scrape( + "https://example.com", + format="markdown", + fetch_config=FetchConfig(render_js=True, stealth=True), +) +``` + +--- + +### Crawl → `crawl.start()` / `crawl.status()` / `crawl.stop()` / `crawl.resume()` + +Crawl methods are now **namespaced** under `client.crawl.*`. The parameter names have been simplified. 
+ +#### v1 + +```python +# Start a crawl +response = client.crawl( + url="https://example.com", + prompt="Extract page titles", + data_schema={"type": "object", "properties": {"title": {"type": "string"}}}, + extraction_mode=True, + depth=2, + max_pages=10, + same_domain_only=True, + batch_size=5, + sitemap=False, + headers={"User-Agent": "MyBot"}, + render_heavy_js=True, + stealth=True, + include_paths=["/blog/*"], + exclude_paths=["/admin/*"], + webhook_url="https://example.com/webhook", + wait_ms=1000, +) + +# Get status +result = client.get_crawl(crawl_id) +``` + +#### v2 + +```python +from scrapegraph_py import FetchConfig + +# Start a crawl +response = client.crawl.start( + "https://example.com", + depth=2, + max_pages=10, + format="markdown", # or "html" + include_patterns=["/blog/*"], + exclude_patterns=["/admin/*"], + fetch_config=FetchConfig( + render_js=True, + stealth=True, + wait_ms=1000, + headers={"User-Agent": "MyBot"}, + ), +) + +# Get status +result = client.crawl.status(crawl_id) + +# Stop a running crawl (NEW) +client.crawl.stop(crawl_id) + +# Resume a stopped crawl (NEW) +client.crawl.resume(crawl_id) +``` + +| v1 parameter | v2 equivalent | +|---|---| +| `url` | `url` (positional) | +| `prompt` | Removed (use `format` instead) | +| `data_schema` | Removed (use `format` instead) | +| `extraction_mode` | Replaced by `format` ("markdown" or "html") | +| `depth` | `depth` (unchanged) | +| `max_pages` | `max_pages` (unchanged) | +| `include_paths` | `include_patterns` | +| `exclude_paths` | `exclude_patterns` | +| `headers`, `stealth`, `render_heavy_js`, `wait_ms` | Moved to `fetch_config=FetchConfig(...)` | +| `same_domain_only` | Removed | +| `batch_size` | Removed | +| `sitemap` | Removed | +| `cache_website` | Removed | +| `breadth` | Removed | +| `webhook_url` | Removed | +| `return_toon` | Removed | + +| v1 method | v2 method | +|---|---| +| `client.crawl(...)` | `client.crawl.start(...)` | +| `client.get_crawl(id)` | 
`client.crawl.status(id)` | +| — | `client.crawl.stop(id)` **(NEW)** | +| — | `client.crawl.resume(id)` **(NEW)** | + +--- + +### Scheduled Jobs → `monitor.*` + +The entire scheduled jobs API has been replaced by the **monitor** namespace. Monitors are simpler: instead of configuring a `service_type` + `job_config`, you directly provide a `url`, `prompt`, and `cron`. + +#### v1 + +```python +# Create +job = client.create_scheduled_job( + job_name="Daily Scraper", + service_type="smartscraper", + cron_expression="0 9 * * *", + job_config={ + "website_url": "https://example.com", + "user_prompt": "Extract company info", + }, + is_active=True, +) + +# List +jobs = client.get_scheduled_jobs(page=1, page_size=20) + +# Get one +job = client.get_scheduled_job(job_id) + +# Update +client.update_scheduled_job(job_id, job_name="Updated Name") + +# Pause / Resume / Delete +client.pause_scheduled_job(job_id) +client.resume_scheduled_job(job_id) +client.delete_scheduled_job(job_id) + +# Trigger manually +client.trigger_scheduled_job(job_id) + +# Execution history +execs = client.get_job_executions(job_id, page=1, page_size=20) +``` + +#### v2 + +```python +from scrapegraph_py import FetchConfig, LlmConfig + +# Create +monitor = client.monitor.create( + name="Daily Scraper", + url="https://example.com", + prompt="Extract company info", + cron="0 9 * * *", + output_schema={"type": "object", "properties": {"name": {"type": "string"}}}, + fetch_config=FetchConfig(stealth=True), + llm_config=LlmConfig(temperature=0.1), +) + +# List +monitors = client.monitor.list() + +# Get one +monitor = client.monitor.get(monitor_id) + +# Pause / Resume / Delete +client.monitor.pause(monitor_id) +client.monitor.resume(monitor_id) +client.monitor.delete(monitor_id) +``` + +| v1 method | v2 method | +|---|---| +| `client.create_scheduled_job(...)` | `client.monitor.create(...)` | +| `client.get_scheduled_jobs(...)` | `client.monitor.list()` | +| `client.get_scheduled_job(id)` | 
`client.monitor.get(id)` | +| `client.pause_scheduled_job(id)` | `client.monitor.pause(id)` | +| `client.resume_scheduled_job(id)` | `client.monitor.resume(id)` | +| `client.delete_scheduled_job(id)` | `client.monitor.delete(id)` | +| `client.update_scheduled_job(...)` | Removed | +| `client.replace_scheduled_job(...)` | Removed | +| `client.trigger_scheduled_job(id)` | Removed | +| `client.get_job_executions(...)` | Removed | + +--- + +### Schema Generation → `schema()` + +#### v1 + +```python +response = client.generate_schema( + user_prompt="Product with name, price, and rating", + existing_schema=None, +) + +status = client.get_schema_status(request_id) +``` + +#### v2 + +```python +response = client.schema( + prompt="Product with name, price, and rating", + existing_schema=None, +) +``` + +| v1 parameter | v2 equivalent | +|---|---| +| `user_prompt` | `prompt` | +| `existing_schema` | `existing_schema` (unchanged) | + +> **Note:** `get_schema_status()` has been removed. + +--- + +### Credits → `credits()` + +#### v1 + +```python +credits = client.get_credits() +``` + +#### v2 + +```python +credits = client.credits() +``` + +--- + +### New Endpoints in v2 + +#### `history()` + +Retrieve your API request history with optional filters. This is new in v2. 
+ +```python +# Get all recent history +history = client.history() + +# With filters +history = client.history( + endpoint="scrape", + status="completed", + limit=10, + offset=0, +) +``` + +--- + +## Removed Endpoints + +The following v1 endpoints have been **removed** in v2 (a replacement is listed where one exists): + +| Removed endpoint | Replacement | +|---|---| +| `client.markdownify()` | `client.scrape(url, format="markdown")` | +| `client.get_markdownify(id)` | Removed (response is direct) | +| `client.agenticscraper()` | Removed | +| `client.get_agenticscraper(id)` | Removed | +| `client.sitemap(url)` | Removed | +| `client.healthz()` | Removed | +| `client.submit_feedback(...)` | Removed | +| All `get_*` polling methods (except `get_crawl`) | Removed (responses are direct); `get_crawl(id)` is now `crawl.status(id)` | + +--- + +## Shared Configuration Models + +v2 introduces `FetchConfig` and `LlmConfig` — reusable configuration objects that replace the scattered per-method parameters from v1. + +### FetchConfig + +Controls how pages are fetched. Used by `scrape()`, `extract()`, `crawl.start()`, and `monitor.create()`. + +```python +from scrapegraph_py import FetchConfig + +config = FetchConfig( + mock=False, # Use mock mode for testing + stealth=True, # Avoid bot detection + scrolls=3, # Number of page scrolls (0-100) + country="us", # Geo-located requests + cookies={"k": "v"}, # Cookies to send + headers={"k": "v"}, # Custom HTTP headers + wait_ms=2000, # Wait before scraping (ms) + render_js=True, # Render heavy JavaScript +) +``` + +### LlmConfig + +Controls the AI model used for extraction. Used by `extract()`, `search()`, and `monitor.create()`. 
+ +```python +from scrapegraph_py import LlmConfig + +config = LlmConfig( + model="gpt-4", # LLM model to use + temperature=0.3, # Sampling temperature (0.0-2.0) + max_tokens=1000, # Max tokens in response + chunker="auto", # Chunking strategy for large pages +) +``` + +--- + +## Removed Features + +| Feature | Notes | +|---|---| +| **Client mock mode** (`mock=True` on the client) | Use `responses` / `aioresponses` / `unittest.mock` for testing; this row is about the client-level flag, not the `FetchConfig(mock=...)` option documented above | +| **TOON format** (`return_toon=True`) | Removed entirely | +| **`from_env()` mock support** | `from_env()` no longer accepts `mock`, `mock_handler`, `mock_responses` | +| **`website_html` / `website_markdown` input** | `extract()` only accepts URLs, not raw HTML/markdown | +| **`TimeRange` enum** | Removed (was used by `searchscraper`) | +| **`SitemapRequest` / `SitemapResponse`** | Removed | +| **All `Get*Request` models** | Removed (no more polling by ID) | + +--- + +## Async Client + +The `AsyncClient` has the exact same API surface as `Client` — all the same changes apply. 
Every method is `async`, so calls on the client itself and on the `crawl` / `monitor` namespaces must be awaited: + +```python +import asyncio +from scrapegraph_py import AsyncClient + +async def main(): + async with AsyncClient(api_key="sgai-...") as client: + # All the same methods, just with await + result = await client.scrape("https://example.com") + result = await client.extract("https://example.com", prompt="Extract title") + result = await client.search("python web scraping") + + # Namespaced methods also use await + job = await client.crawl.start("https://example.com", depth=2) + status = await client.crawl.status(job["id"]) + + monitor = await client.monitor.create( + name="Tracker", + url="https://example.com", + prompt="Extract prices", + cron="0 9 * * *", + ) + +asyncio.run(main()) +``` + +--- + +## Quick Find-and-Replace Cheatsheet + +For a fast migration, search your codebase for these patterns: + +| Search for | Replace with | +|---|---| +| `client.smartscraper(` | `client.extract(` | +| `website_url=` | `url=` | +| `user_prompt=` | `prompt=` (in extract/schema) or `query=` (in search) | +| `client.searchscraper(` | `client.search(` | +| `client.markdownify(` | `client.scrape(` (and add `format="markdown"`) | +| `client.get_smartscraper(` | Remove (response is direct) | +| `client.get_searchscraper(` | Remove | +| `client.get_scrape(` | Remove | +| `client.get_markdownify(` | Remove | +| `client.get_crawl(` | `client.crawl.status(` | +| `client.crawl(` | `client.crawl.start(` | +| `client.agenticscraper(` | Remove | +| `client.get_agenticscraper(` | Remove | +| `client.sitemap(` | Remove | +| `client.healthz()` | Remove | +| `client.submit_feedback(` | Remove | +| `client.get_credits()` | `client.credits()` | +| `client.generate_schema(` | `client.schema(` | +| `client.get_schema_status(` | Remove | +| `client.create_scheduled_job(` | `client.monitor.create(` | +| `client.get_scheduled_jobs(` | `client.monitor.list(` (called without arguments in the v2 examples) | +| `client.get_scheduled_job(` | `client.monitor.get(` | +| `client.pause_scheduled_job(` | 
`client.monitor.pause(` | +| `client.resume_scheduled_job(` | `client.monitor.resume(` | +| `client.delete_scheduled_job(` | `client.monitor.delete(` | +| `client.trigger_scheduled_job(` | Remove | +| `client.update_scheduled_job(` | Remove | +| `client.replace_scheduled_job(` | Remove | +| `client.get_job_executions(` | Remove | +| `return_toon=True` | Remove | +| `render_heavy_js=` | `fetch_config=FetchConfig(render_js=...)` | +| `from scrapegraph_py.models.smartscraper import` | Remove | +| `from scrapegraph_py.models.searchscraper import` | Remove | +| `from scrapegraph_py.models.markdownify import` | Remove | +| `from scrapegraph_py.models.agenticscraper import` | Remove | +| `from scrapegraph_py.models.sitemap import` | Remove | +| `from scrapegraph_py.models.feedback import` | Remove | +| `from scrapegraph_py.models.scheduled_jobs import` | Remove | diff --git a/scrapegraph-py/examples/advanced_features/cookies/cookies_integration_example.py b/scrapegraph-py/examples/advanced_features/cookies/cookies_integration_example.py deleted file mode 100644 index d7c2d6f..0000000 --- a/scrapegraph-py/examples/advanced_features/cookies/cookies_integration_example.py +++ /dev/null @@ -1,285 +0,0 @@ -""" -Comprehensive example demonstrating cookies integration for web scraping. - -This example shows various real-world scenarios where cookies are essential: -1. E-commerce site scraping with authentication -2. Social media scraping with session cookies -3. Banking/financial site scraping with secure cookies -4. News site scraping with user preferences -5. 
API endpoint scraping with authentication tokens - -Requirements: -- Python 3.7+ -- scrapegraph-py -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import json -import os -from typing import Optional - -from dotenv import load_dotenv -from pydantic import BaseModel, Field - -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -# Define data models for different scenarios -class ProductInfo(BaseModel): - """Model for e-commerce product information.""" - - name: str = Field(description="Product name") - price: str = Field(description="Product price") - availability: str = Field(description="Product availability status") - rating: Optional[str] = Field(description="Product rating", default=None) - - -class SocialMediaPost(BaseModel): - """Model for social media post information.""" - - author: str = Field(description="Post author") - content: str = Field(description="Post content") - likes: Optional[str] = Field(description="Number of likes", default=None) - comments: Optional[str] = Field(description="Number of comments", default=None) - timestamp: Optional[str] = Field(description="Post timestamp", default=None) - - -class NewsArticle(BaseModel): - """Model for news article information.""" - - title: str = Field(description="Article title") - summary: str = Field(description="Article summary") - author: Optional[str] = Field(description="Article author", default=None) - publish_date: Optional[str] = Field(description="Publish date", default=None) - - -class BankTransaction(BaseModel): - """Model for banking transaction information.""" - - date: str = Field(description="Transaction date") - description: str = Field(description="Transaction description") - amount: str = Field(description="Transaction amount") - type: str = Field(description="Transaction type (credit/debit)") - - -def scrape_ecommerce_with_auth(): - """Example: Scrape e-commerce site with authentication 
cookies.""" - print("=" * 60) - print("E-COMMERCE SITE SCRAPING WITH AUTHENTICATION") - print("=" * 60) - - # Example cookies for an e-commerce site - cookies = { - "session_id": "abc123def456", - "user_id": "user789", - "cart_id": "cart101112", - "preferences": "dark_mode,usd", - "auth_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...", - } - - website_url = "https://example-ecommerce.com/products" - user_prompt = ( - "Extract product information including name, price, availability, and rating" - ) - - try: - client = Client.from_env() - response = client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - cookies=cookies, - output_schema=ProductInfo, - number_of_scrolls=5, # Scroll to load more products - ) - - print("✅ E-commerce scraping completed successfully") - print(json.dumps(response, indent=2)) - client.close() - - except Exception as e: - print(f"❌ Error in e-commerce scraping: {str(e)}") - - -def scrape_social_media_with_session(): - """Example: Scrape social media with session cookies.""" - print("\n" + "=" * 60) - print("SOCIAL MEDIA SCRAPING WITH SESSION COOKIES") - print("=" * 60) - - # Example cookies for a social media site - cookies = { - "session_token": "xyz789abc123", - "user_session": "def456ghi789", - "csrf_token": "jkl012mno345", - "remember_me": "true", - "language": "en_US", - } - - website_url = "https://example-social.com/feed" - user_prompt = ( - "Extract posts from the feed including author, content, likes, and comments" - ) - - try: - client = Client.from_env() - response = client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - cookies=cookies, - output_schema=SocialMediaPost, - number_of_scrolls=10, # Scroll to load more posts - ) - - print("✅ Social media scraping completed successfully") - print(json.dumps(response, indent=2)) - client.close() - - except Exception as e: - print(f"❌ Error in social media scraping: {str(e)}") - - -def scrape_news_with_preferences(): - """Example: Scrape news 
site with user preference cookies.""" - print("\n" + "=" * 60) - print("NEWS SITE SCRAPING WITH USER PREFERENCES") - print("=" * 60) - - # Example cookies for a news site - cookies = { - "user_preferences": "technology,science,ai", - "reading_level": "advanced", - "region": "US", - "subscription_tier": "premium", - "theme": "dark", - } - - website_url = "https://example-news.com/technology" - user_prompt = ( - "Extract news articles including title, summary, author, and publish date" - ) - - try: - client = Client.from_env() - response = client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - cookies=cookies, - output_schema=NewsArticle, - total_pages=3, # Scrape multiple pages - ) - - print("✅ News scraping completed successfully") - print(json.dumps(response, indent=2)) - client.close() - - except Exception as e: - print(f"❌ Error in news scraping: {str(e)}") - - -def scrape_banking_with_secure_cookies(): - """Example: Scrape banking site with secure authentication cookies.""" - print("\n" + "=" * 60) - print("BANKING SITE SCRAPING WITH SECURE COOKIES") - print("=" * 60) - - # Example secure cookies for a banking site - cookies = { - "secure_session": "pqr678stu901", - "auth_token": "vwx234yz567", - "mfa_verified": "true", - "device_id": "device_abc123", - "last_activity": "2024-01-15T10:30:00Z", - } - - website_url = "https://example-bank.com/transactions" - user_prompt = ( - "Extract recent transactions including date, description, amount, and type" - ) - - try: - client = Client.from_env() - response = client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - cookies=cookies, - output_schema=BankTransaction, - total_pages=5, # Scrape multiple pages of transactions - ) - - print("✅ Banking scraping completed successfully") - print(json.dumps(response, indent=2)) - client.close() - - except Exception as e: - print(f"❌ Error in banking scraping: {str(e)}") - - -def scrape_api_with_auth_tokens(): - """Example: Scrape API 
endpoint with authentication tokens.""" - print("\n" + "=" * 60) - print("API ENDPOINT SCRAPING WITH AUTH TOKENS") - print("=" * 60) - - # Example API authentication cookies - cookies = { - "api_token": "api_abc123def456", - "client_id": "client_789", - "access_token": "access_xyz789", - "refresh_token": "refresh_abc123", - "scope": "read:all", - } - - website_url = "https://api.example.com/data" - user_prompt = "Extract data from the API response" - - try: - client = Client.from_env() - response = client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - cookies=cookies, - headers={"Accept": "application/json", "Content-Type": "application/json"}, - ) - - print("✅ API scraping completed successfully") - print(json.dumps(response, indent=2)) - client.close() - - except Exception as e: - print(f"❌ Error in API scraping: {str(e)}") - - -def main(): - """Run all cookies integration examples.""" - # Check if API key is available - if not os.getenv("SGAI_API_KEY"): - print("Error: SGAI_API_KEY not found in .env file") - print("Please create a .env file with your API key:") - print("SGAI_API_KEY=your_api_key_here") - return - - print("🍪 COOKIES INTEGRATION EXAMPLES") - print( - "This demonstrates various real-world scenarios where cookies are essential for web scraping." 
- ) - - # Run all examples - scrape_ecommerce_with_auth() - scrape_social_media_with_session() - scrape_news_with_preferences() - scrape_banking_with_secure_cookies() - scrape_api_with_auth_tokens() - - print("\n" + "=" * 60) - print("✅ All examples completed!") - print("=" * 60) - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/advanced_features/mock/async_mock_mode_example.py b/scrapegraph-py/examples/advanced_features/mock/async_mock_mode_example.py deleted file mode 100644 index 31d3328..0000000 --- a/scrapegraph-py/examples/advanced_features/mock/async_mock_mode_example.py +++ /dev/null @@ -1,61 +0,0 @@ -import asyncio - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - - -sgai_logger.set_logging(level="INFO") - - -async def basic_mock_usage(): - # Initialize the client with mock mode enabled - async with AsyncClient.from_env(mock=True) as client: - print("\n-- get_credits (mock) --") - print(await client.get_credits()) - - print("\n-- markdownify (mock) --") - md = await client.markdownify(website_url="https://example.com") - print(md) - - print("\n-- get_markdownify (mock) --") - md_status = await client.get_markdownify("00000000-0000-0000-0000-000000000123") - print(md_status) - - print("\n-- smartscraper (mock) --") - ss = await client.smartscraper(user_prompt="Extract title", website_url="https://example.com") - print(ss) - - -async def mock_with_path_overrides(): - # Initialize the client with mock mode and custom responses - async with AsyncClient.from_env( - mock=True, - mock_responses={ - "/v1/credits": {"remaining_credits": 42, "total_credits_used": 58} - }, - ) as client: - print("\n-- get_credits with override (mock) --") - print(await client.get_credits()) - - -async def mock_with_custom_handler(): - def handler(method, url, kwargs): - return {"handled_by": "custom_handler", "method": method, "url": url} - - # Initialize the client with mock mode and custom handler - async with 
AsyncClient.from_env(mock=True, mock_handler=handler) as client: - print("\n-- searchscraper via custom handler (mock) --") - resp = await client.searchscraper(user_prompt="Search something") - print(resp) - - -async def main(): - await basic_mock_usage() - await mock_with_path_overrides() - await mock_with_custom_handler() - - -if __name__ == "__main__": - asyncio.run(main()) - - diff --git a/scrapegraph-py/examples/advanced_features/mock/mock_mode_example.py b/scrapegraph-py/examples/advanced_features/mock/mock_mode_example.py deleted file mode 100644 index c2bc8b1..0000000 --- a/scrapegraph-py/examples/advanced_features/mock/mock_mode_example.py +++ /dev/null @@ -1,58 +0,0 @@ -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - - -sgai_logger.set_logging(level="INFO") - - -def basic_mock_usage(): - # Initialize the client with mock mode enabled - client = Client.from_env(mock=True) - - print("\n-- get_credits (mock) --") - print(client.get_credits()) - - print("\n-- markdownify (mock) --") - md = client.markdownify(website_url="https://example.com") - print(md) - - print("\n-- get_markdownify (mock) --") - md_status = client.get_markdownify("00000000-0000-0000-0000-000000000123") - print(md_status) - - print("\n-- smartscraper (mock) --") - ss = client.smartscraper(user_prompt="Extract title", website_url="https://example.com") - print(ss) - - -def mock_with_path_overrides(): - # Initialize the client with mock mode and custom responses - client = Client.from_env( - mock=True, - mock_responses={ - "/v1/credits": {"remaining_credits": 42, "total_credits_used": 58} - }, - ) - - print("\n-- get_credits with override (mock) --") - print(client.get_credits()) - - -def mock_with_custom_handler(): - def handler(method, url, kwargs): - return {"handled_by": "custom_handler", "method": method, "url": url} - - # Initialize the client with mock mode and custom handler - client = Client.from_env(mock=True, mock_handler=handler) - - print("\n-- 
searchscraper via custom handler (mock) --") - resp = client.searchscraper(user_prompt="Search something") - print(resp) - - -if __name__ == "__main__": - basic_mock_usage() - mock_with_path_overrides() - mock_with_custom_handler() - - diff --git a/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_agenticscraper_example.py b/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_agenticscraper_example.py deleted file mode 100644 index e858703..0000000 --- a/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_agenticscraper_example.py +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/env python3 -""" -Async Step-by-Step AgenticScraper Example - -This example demonstrates how to use the AgenticScraper API asynchronously -for automated browser interactions with proper async/await patterns. -""" - -import asyncio -import json -import os -import time - -import aiohttp -from dotenv import load_dotenv - -# Load environment variables from .env file -load_dotenv() - - -async def agentic_scraper_request(): - """Example of making an async request to the agentic scraper API""" - - # Get API key from .env file - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - raise ValueError( - "API key must be provided or set in .env file as SGAI_API_KEY. 
" - "Create a .env file with: SGAI_API_KEY=your_api_key_here" - ) - - steps = [ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login" - ] - website_url = "https://dashboard.scrapegraphai.com/" - - headers = { - "SGAI-APIKEY": api_key, - "Content-Type": "application/json", - } - - body = { - "url": website_url, - "use_session": True, - "steps": steps, - } - - print("🤖 Starting Async Agentic Scraper with Automated Actions...") - print(f"🌐 Website URL: {website_url}") - print(f"🔧 Use Session: True") - print(f"📋 Steps: {len(steps)} automated actions") - print("\n" + "=" * 60) - - # Start timer - start_time = time.time() - print( - f"⏱️ Timer started at: {time.strftime('%H:%M:%S', time.localtime(start_time))}" - ) - print("🔄 Processing request asynchronously...") - - try: - async with aiohttp.ClientSession() as session: - async with session.post( - "http://localhost:8001/v1/agentic-scrapper", - json=body, - headers=headers, - ) as response: - # Calculate execution time - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Total execution time: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print( - f"📊 Performance: {execution_time:.1f}s ({execution_minutes:.1f}m) for {len(steps)} steps" - ) - - if response.status == 200: - result = await response.json() - print("✅ Request completed successfully!") - print(f"📊 Request ID: {result.get('request_id', 'N/A')}") - print(f"🔄 Status: {result.get('status', 'N/A')}") - - if result.get("error"): - print(f"❌ Error: {result['error']}") - else: - print("\n📋 EXTRACTED DATA:") - print("=" * 60) - - # Pretty print the result with proper indentation - if "result" in result: - print(json.dumps(result["result"], indent=2, ensure_ascii=False)) - else: - print("No result data found") - - else: - 
response_text = await response.text() - print(f"❌ Request failed with status code: {response.status}") - print(f"Response: {response_text}") - - except aiohttp.ClientError as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"🌐 Network error: {str(e)}") - except Exception as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"💥 Unexpected error: {str(e)}") - - -def show_curl_equivalent(): - """Show the equivalent curl command for reference""" - - # Load environment variables from .env file - load_dotenv() - - api_key = os.getenv("SGAI_API_KEY", "your-api-key-here") - curl_command = f""" -curl --location 'http://localhost:8001/v1/agentic-scrapper' \\ ---header 'SGAI-APIKEY: {api_key}' \\ ---header 'Content-Type: application/json' \\ ---data-raw '{{ - "url": "https://dashboard.scrapegraphai.com/", - "use_session": true, - "steps": [ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login" - ] -}}' - """ - - print("Equivalent curl command:") - print(curl_command) - - -async def main(): - """Main async function to run the agentic scraper example""" - try: - print("🤖 ASYNC AGENTIC SCRAPER EXAMPLE") - print("=" * 60) - print("This example demonstrates async automated browser interactions") - print() - - # Show the curl equivalent - show_curl_equivalent() - - print("\n" + "=" * 60) - - # Make the actual API request - await agentic_scraper_request() - - print("\n" + "=" * 60) - 
print("Example completed!") - print("\nKey takeaways:") - print("1. Async agentic scraper enables non-blocking automation") - print("2. Each step is executed sequentially but asynchronously") - print("3. Session management allows for complex workflows") - print("4. Perfect for concurrent automation tasks") - print("\nNext steps:") - print("- Run multiple agentic scrapers concurrently") - print("- Combine with other async operations") - print("- Implement async error handling") - print("- Use async session management for efficiency") - - except Exception as e: - print(f"💥 Error occurred: {str(e)}") - print("\n🛠️ Troubleshooting:") - print("1. Make sure your .env file contains SGAI_API_KEY") - print("2. Ensure the API server is running on localhost:8001") - print("3. Check your internet connection") - print("4. Verify the target website is accessible") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_cookies_example.py b/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_cookies_example.py deleted file mode 100644 index e40e654..0000000 --- a/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_cookies_example.py +++ /dev/null @@ -1,358 +0,0 @@ -#!/usr/bin/env python3 -""" -Async Step-by-Step Cookies Example - -This example demonstrates how to use cookies with SmartScraper API using async/await patterns. -It shows how to set up and execute requests with custom cookies for authentication and session management. 
-""" - -import asyncio -import json -import logging -import os -import time - -import httpx -from dotenv import load_dotenv - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - handlers=[logging.StreamHandler()], -) -logger = logging.getLogger(__name__) - -# Load environment variables from .env file -load_dotenv() - - -async def step_1_environment_setup(): - """Step 1: Set up environment and API key""" - print("STEP 1: Environment Setup") - print("=" * 40) - - # Check if API key is available - api_key = os.getenv("TEST_API_KEY") - if not api_key: - print("❌ Error: TEST_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export TEST_API_KEY='your-api-key-here'") - print(" 2. Create a .env file with: TEST_API_KEY=your-api-key-here") - return None - - print("✅ API key found in environment") - print(f"🔑 API Key: {api_key[:8]}...{api_key[-4:]}") - return api_key - - -async def step_2_server_connectivity_check(api_key): - """Step 2: Check server connectivity""" - print("\nSTEP 2: Server Connectivity Check") - print("=" * 40) - - url = "http://localhost:8001/v1/smartscraper" - - try: - async with httpx.AsyncClient(timeout=5.0) as client: - # Try to access the health endpoint - health_url = url.replace("/v1/smartscraper", "/healthz") - response = await client.get(health_url) - - if response.status_code == 200: - print("✅ Server is accessible") - print(f"🔗 Health endpoint: {health_url}") - return True - else: - print( - f"❌ Server health check failed with status {response.status_code}" - ) - return False - except Exception as e: - print(f"❌ Server connectivity check failed: {e}") - print("Please ensure the server is running:") - print(" poetry run uvicorn app.main:app --host 0.0.0.0 --port 8001 --reload") - return False - - -def step_3_define_cookies(): - """Step 3: Define cookies for authentication""" - print("\nSTEP 3: Define Cookies") - print("=" * 
40) - - # Example cookies for a website that requires authentication - cookies = { - "session_id": "abc123def456ghi789", - "user_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...", - "remember_me": "true", - "language": "en", - "theme": "dark", - } - - print("🍪 Cookies configured:") - for key, value in cookies.items(): - if "token" in key.lower(): - # Mask sensitive tokens - masked_value = value[:20] + "..." if len(value) > 20 else value - print(f" {key}: {masked_value}") - else: - print(f" {key}: {value}") - - print(f"\n📊 Total cookies: {len(cookies)}") - return cookies - - -def step_4_define_request_parameters(): - """Step 4: Define the request parameters""" - print("\nSTEP 4: Define Request Parameters") - print("=" * 40) - - # Configuration parameters - website_url = "https://example.com/dashboard" - user_prompt = "Extract user profile information and account details" - - print("🌐 Website URL:") - print(f" {website_url}") - print("\n📝 User Prompt:") - print(f" {user_prompt}") - print("\n🎯 Goal: Access authenticated content using cookies") - - return {"website_url": website_url, "user_prompt": user_prompt} - - -def step_5_prepare_headers(api_key): - """Step 5: Prepare request headers""" - print("\nSTEP 5: Prepare Request Headers") - print("=" * 40) - - headers = { - "SGAI-APIKEY": api_key, - "Content-Type": "application/json", - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36", - "Accept": "application/json", - "Accept-Language": "en-US,en;q=0.9", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - } - - print("📋 Headers configured:") - for key, value in headers.items(): - if key == "SGAI-APIKEY": - print(f" {key}: {value[:10]}...{value[-10:]}") # Mask API key - else: - print(f" {key}: {value}") - - return headers - - -async def step_6_execute_cookies_request(headers, cookies, config): - """Step 6: Execute the request with cookies""" - print("\nSTEP 6: 
Execute Request with Cookies") - print("=" * 40) - - url = "http://localhost:8001/v1/smartscraper" - - # Request payload with cookies - payload = { - "website_url": config["website_url"], - "user_prompt": config["user_prompt"], - "output_schema": {}, - "cookies": cookies, - } - - print("🚀 Starting request with cookies...") - print("🍪 Using authentication cookies for access...") - - try: - # Start timing - start_time = time.time() - - # Use timeout for cookies requests - async with httpx.AsyncClient(timeout=120.0) as client: - response = await client.post(url, headers=headers, json=payload) - - # Calculate duration - duration = time.time() - start_time - - print(f"✅ Request completed in {duration:.2f} seconds") - print(f"📊 Response Status: {response.status_code}") - - if response.status_code == 200: - result = response.json() - return result, duration - else: - print(f"❌ Request failed with status {response.status_code}") - print(f"Response: {response.text}") - return None, duration - - except httpx.TimeoutException: - duration = time.time() - start_time - print(f"❌ Request timed out after {duration:.2f} seconds (>120s timeout)") - print("This may indicate authentication issues or slow response.") - return None, duration - - except httpx.RequestError as e: - duration = time.time() - start_time - print(f"❌ Request error after {duration:.2f} seconds: {e}") - print("Common causes:") - print(" - Server is not running") - print(" - Invalid cookies") - print(" - Network connectivity issues") - return None, duration - - except Exception as e: - duration = time.time() - start_time - print(f"❌ Unexpected error after {duration:.2f} seconds: {e}") - return None, duration - - -def step_7_process_results(result, duration): - """Step 7: Process and display the results""" - print("\nSTEP 7: Process Results") - print("=" * 40) - - if result is None: - print("❌ No results to process") - return - - print("📋 Processing authenticated results...") - - # Display results based on type - 
if isinstance(result, dict): - print("\n🔍 Response Structure:") - print(json.dumps(result, indent=2, ensure_ascii=False)) - - # Check for authentication success indicators - if "result" in result: - print("\n✨ Authentication successful! Data extracted with cookies") - - elif isinstance(result, list): - print(f"\n✅ Authentication successful! Extracted {len(result)} items") - - # Show first few items - print("\n📦 Sample Results:") - for i, item in enumerate(result[:3]): # Show first 3 items - print(f" {i+1}. {item}") - - if len(result) > 3: - print(f" ... and {len(result) - 3} more items") - - else: - print(f"\n📋 Result: {result}") - - print(f"\n⏱️ Total processing time: {duration:.2f} seconds") - - -def step_8_show_curl_equivalent(api_key, cookies, config): - """Step 8: Show equivalent curl command""" - print("\nSTEP 8: Equivalent curl Command") - print("=" * 40) - - # Convert cookies dict to curl format - cookies_str = "; ".join([f"{k}={v}" for k, v in cookies.items()]) - - curl_command = f""" -curl --location 'http://localhost:8001/v1/smartscraper' \\ ---header 'SGAI-APIKEY: {api_key}' \\ ---header 'Content-Type: application/json' \\ ---header 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36' \\ ---header 'Accept: application/json' \\ ---header 'Accept-Language: en-US,en;q=0.9' \\ ---header 'Accept-Encoding: gzip, deflate, br' \\ ---header 'Connection: keep-alive' \\ ---cookie '{cookies_str}' \\ ---data '{{ - "website_url": "{config['website_url']}", - "user_prompt": "{config['user_prompt']}", - "output_schema": {{}}, - "cookies": {json.dumps(cookies)} -}}' - """ - - print("Equivalent curl command:") - print(curl_command) - - -def step_9_cookie_management_tips(): - """Step 9: Provide cookie management tips""" - print("\nSTEP 9: Cookie Management Tips") - print("=" * 40) - - print("🍪 Best Practices for Cookie Management:") - print("1. 
🔐 Store sensitive cookies securely (environment variables)") - print("2. ⏰ Set appropriate expiration times") - print("3. 🧹 Clean up expired cookies regularly") - print("4. 🔄 Refresh tokens before they expire") - print("5. 🛡️ Use HTTPS for cookie transmission") - print("6. 📝 Log cookie usage for debugging") - print("7. 🚫 Don't hardcode cookies in source code") - print("8. 🔍 Validate cookie format before sending") - - -async def main(): - """Main function to run the async step-by-step cookies example""" - total_start_time = time.time() - logger.info("Starting Async Step-by-Step Cookies Example") - - print("ScrapeGraph SDK - Async Step-by-Step Cookies Example") - print("=" * 60) - print("This example shows the complete async process of setting up and") - print("executing requests with cookies for authentication") - print("=" * 60) - - # Step 1: Environment setup - api_key = await step_1_environment_setup() - if not api_key: - return - - # Step 2: Server connectivity check - server_ok = await step_2_server_connectivity_check(api_key) - if not server_ok: - return - - # Step 3: Define cookies - cookies = step_3_define_cookies() - - # Step 4: Define request parameters - config = step_4_define_request_parameters() - - # Step 5: Prepare headers - headers = step_5_prepare_headers(api_key) - - # Step 6: Execute request - result, duration = await step_6_execute_cookies_request(headers, cookies, config) - - # Step 7: Process results - step_7_process_results(result, duration) - - # Step 8: Show curl equivalent - step_8_show_curl_equivalent(api_key, cookies, config) - - # Step 9: Cookie management tips - step_9_cookie_management_tips() - - total_duration = time.time() - total_start_time - logger.info( - f"Example completed! Total execution time: {total_duration:.2f} seconds" - ) - - print("\n" + "=" * 60) - print("Async step-by-step cookies example completed!") - print(f"⏱️ Total execution time: {total_duration:.2f} seconds") - print("\nKey takeaways:") - print("1. 
Async/await provides better performance for I/O operations") - print("2. Cookies enable access to authenticated content") - print("3. Always validate API key and server connectivity first") - print("4. Secure cookie storage is crucial for production use") - print("5. Handle authentication errors gracefully") - print("6. Use equivalent curl commands for testing") - print("\nNext steps:") - print("- Implement secure cookie storage") - print("- Add cookie refresh logic") - print("- Handle authentication failures") - print("- Monitor cookie expiration") - print("- Implement retry logic for failed requests") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_movements_example.py b/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_movements_example.py deleted file mode 100644 index 5966329..0000000 --- a/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_movements_example.py +++ /dev/null @@ -1,479 +0,0 @@ -#!/usr/bin/env python3 -""" -Async Step-by-Step SmartScraper Movements Example - -This example demonstrates how to use interactive movements with SmartScraper API -using async/await patterns for better performance and concurrency. 
-""" - -import asyncio -import json -import logging -import os -import time - -import httpx -from dotenv import load_dotenv - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - handlers=[logging.StreamHandler()], -) -logger = logging.getLogger(__name__) - - -async def check_server_connectivity(base_url: str) -> bool: - """Check if the server is running and accessible""" - try: - async with httpx.AsyncClient(timeout=5.0) as client: - # Try to access the health endpoint - health_url = base_url.replace("/v1/smartscraper", "/healthz") - response = await client.get(health_url) - return response.status_code == 200 - except Exception: - return False - - -async def async_smart_scraper_movements(): - """Async example of making a movements request to the smartscraper API""" - - # Load environment variables from .env file - load_dotenv() - - # Get API key from .env file - api_key = os.getenv("TEST_API_KEY") - if not api_key: - raise ValueError( - "API key must be provided or set in .env file as TEST_API_KEY. 
" - "Create a .env file with: TEST_API_KEY=your_api_key_here" - ) - - steps = [ - "click on search bar", - "wait for 500ms", - "fill email input box with mdehsan873@gmail.com", - "wait a sec", - "click on the first time of search result", - "wait for 2 seconds to load the result of search", - ] - website_url = "https://github.com/" - user_prompt = "Extract user profile" - - headers = { - "SGAI-APIKEY": api_key, - "Content-Type": "application/json", - } - - body = { - "website_url": website_url, - "user_prompt": user_prompt, - "output_schema": {}, - "steps": steps, - } - - print("🚀 Starting Async Smart Scraper with Interactive Movements...") - print(f"🌐 Website URL: {website_url}") - print(f"🎯 User Prompt: {user_prompt}") - print(f"📋 Steps: {len(steps)} interactive steps") - print("\n" + "=" * 60) - - # Start timer - start_time = time.time() - print( - f"⏱️ Timer started at: {time.strftime('%H:%M:%S', time.localtime(start_time))}" - ) - print("🔄 Processing async request...") - - try: - # Use longer timeout for movements requests as they may take more time - async with httpx.AsyncClient(timeout=300.0) as client: - response = await client.post( - "http://localhost:8001/v1/smartscraper", - json=body, - headers=headers, - ) - - # Calculate execution time - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Total execution time: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print( - f"📊 Performance: {execution_time:.1f}s ({execution_minutes:.1f}m) for {len(steps)} steps" - ) - - if response.status_code == 200: - result = response.json() - print("✅ Request completed successfully!") - print(f"📊 Request ID: {result.get('request_id', 'N/A')}") - print(f"🔄 Status: {result.get('status', 'N/A')}") - - if result.get("error"): - print(f"❌ Error: {result['error']}") - else: - print("\n📋 
EXTRACTED DATA:") - print("=" * 60) - - # Pretty print the result with proper indentation - if "result" in result: - print( - json.dumps(result["result"], indent=2, ensure_ascii=False) - ) - else: - print("No result data found") - - else: - print(f"❌ Request failed with status code: {response.status_code}") - print(f"Response: {response.text}") - - except httpx.TimeoutException: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before timeout: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print("⏰ Request timed out after 300 seconds") - except httpx.RequestError as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"🌐 Network error: {str(e)}") - except Exception as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"💥 Unexpected error: {str(e)}") - - -async def async_markdownify_movements(): - """ - Async enhanced markdownify function with comprehensive features and timing. - - Note: Markdownify doesn't support interactive movements like Smart Scraper. - Instead, it excels at converting websites to clean markdown format. 
- """ - # Load environment variables from .env file - load_dotenv() - - # Get API key from .env file - api_key = os.getenv("TEST_API_KEY") - if not api_key: - raise ValueError( - "API key must be provided or set in .env file as TEST_API_KEY. " - "Create a .env file with: TEST_API_KEY=your_api_key_here" - ) - - steps = [ - "click on search bar", - "wait for 500ms", - "fill email input box with mdehsan873@gmail.com", - "wait a sec", - "click on the first time of search result", - "wait for 2 seconds to load the result of search", - ] - - # Target website configuration - website_url = "https://scrapegraphai.com/" - - # Enhanced headers for better scraping (similar to interactive movements) - custom_headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1", - } - - # Prepare API request headers - headers = { - "SGAI-APIKEY": api_key, - "Content-Type": "application/json", - } - - # Request body for markdownify - body = { - "website_url": website_url, - "headers": custom_headers, - "steps": steps, - } - - print("🚀 Starting Async Markdownify with Enhanced Features...") - print(f"🌐 Website URL: {website_url}") - print(f"📋 Custom Headers: {len(custom_headers)} headers configured") - print("🎯 Goal: Convert website to clean markdown format") - print("\n" + "=" * 60) - - # Start timer - start_time = time.time() - print( - f"⏱️ Timer started at: {time.strftime('%H:%M:%S', time.localtime(start_time))}" - ) - print("🔄 Processing async markdown conversion...") - - try: - async with httpx.AsyncClient(timeout=120.0) as client: - response = await client.post( - "http://localhost:8001/v1/markdownify", - json=body, - headers=headers, - ) - - # Calculate execution 
time - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Total execution time: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print( - f"📊 Performance: {execution_time:.1f}s ({execution_minutes:.1f}m) for markdown conversion" - ) - - if response.status_code == 200: - result = response.json() - markdown_content = result.get("result", "") - - print("✅ Request completed successfully!") - print(f"📊 Request ID: {result.get('request_id', 'N/A')}") - print(f"🔄 Status: {result.get('status', 'N/A')}") - print(f"📝 Content Length: {len(markdown_content)} characters") - - if result.get("error"): - print(f"❌ Error: {result['error']}") - else: - print("\n📋 MARKDOWN CONVERSION RESULTS:") - print("=" * 60) - - # Display markdown statistics - lines = markdown_content.split("\n") - words = len(markdown_content.split()) - - print("📊 Statistics:") - print(f" - Total Lines: {len(lines)}") - print(f" - Total Words: {words}") - print(f" - Total Characters: {len(markdown_content)}") - print( - f" - Processing Speed: {len(markdown_content)/execution_time:.0f} chars/second" - ) - - # Display first 500 characters - print("\n🔍 First 500 characters:") - print("-" * 50) - print(markdown_content[:500]) - if len(markdown_content) > 500: - print("...") - print("-" * 50) - - # Save to file - filename = f"async_markdownify_output_{int(time.time())}.md" - await save_markdown_to_file_async(markdown_content, filename) - - # Display content analysis - analyze_markdown_content(markdown_content) - - else: - print(f"❌ Request failed with status code: {response.status_code}") - print(f"Response: {response.text}") - - except httpx.TimeoutException: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', 
time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before timeout: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print("⏰ Request timed out after 120 seconds") - except httpx.RequestError as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"🌐 Network error: {str(e)}") - except Exception as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"💥 Unexpected error: {str(e)}") - - -async def save_markdown_to_file_async(markdown_content: str, filename: str): - """ - Save markdown content to a file with enhanced error handling (async version). - - Args: - markdown_content: The markdown content to save - filename: The name of the file to save to - """ - try: - # Use asyncio to run the file operation in a thread pool - await asyncio.to_thread(_write_file_sync, markdown_content, filename) - print(f"💾 Markdown saved to: {filename}") - except Exception as e: - print(f"❌ Error saving file: {str(e)}") - - -def _write_file_sync(markdown_content: str, filename: str): - """Synchronous file writing function for asyncio.to_thread""" - with open(filename, "w", encoding="utf-8") as f: - f.write(markdown_content) - - -def analyze_markdown_content(markdown_content: str): - """ - Analyze the markdown content and provide insights. 
- - Args: - markdown_content: The markdown content to analyze - """ - print("\n🔍 CONTENT ANALYSIS:") - print("-" * 50) - - # Count different markdown elements - lines = markdown_content.split("\n") - headers = [line for line in lines if line.strip().startswith("#")] - links = [line for line in lines if "[" in line and "](" in line] - code_blocks = markdown_content.count("```") - - print(f"📑 Headers found: {len(headers)}") - print(f"🔗 Links found: {len(links)}") - print( - f"💻 Code blocks: {code_blocks // 2}" - ) # Divide by 2 since each block has opening and closing - - # Show first few headers if they exist - if headers: - print("\n📋 First few headers:") - for i, header in enumerate(headers[:3]): - print(f" {i+1}. {header.strip()}") - if len(headers) > 3: - print(f" ... and {len(headers) - 3} more") - - -def show_curl_equivalent(): - """Show the equivalent curl command for reference""" - - # Load environment variables from .env file - load_dotenv() - - api_key = os.getenv("TEST_API_KEY", "your-api-key-here") - curl_command = f""" -curl --location 'http://localhost:8001/v1/smartscraper' \\ ---header 'SGAI-APIKEY: {api_key}' \\ ---header 'Content-Type: application/json' \\ ---data '{{ - "website_url": "https://github.com/", - "user_prompt": "Extract user profile", - "output_schema": {{}}, - "steps": [ - "click on search bar", - "wait for 500ms", - "fill email input box with mdehsan873@gmail.com", - "wait a sec", - "click on the first time of search result", - "wait for 2 seconds to load the result of search" - ] -}}' - """ - - print("Equivalent curl command:") - print(curl_command) - - -async def main(): - """Main function to run the async movements examples""" - total_start_time = time.time() - logger.info("Starting Async SmartScraper Movements Examples") - - try: - print("🎯 ASYNC SMART SCRAPER MOVEMENTS EXAMPLES") - print("=" * 60) - print("This example demonstrates async interactive movements with timing") - print() - - # Show the curl equivalent - 
show_curl_equivalent() - - print("\n" + "=" * 60) - - # Make the actual API requests - print("1️⃣ Running SmartScraper Movements Example...") - await async_smart_scraper_movements() - - print("\n" + "=" * 60) - print("2️⃣ Running Markdownify Movements Example...") - await async_markdownify_movements() - - total_duration = time.time() - total_start_time - logger.info( - f"Examples completed! Total execution time: {total_duration:.2f} seconds" - ) - - print("\n" + "=" * 60) - print("Examples completed!") - print(f"⏱️ Total execution time: {total_duration:.2f} seconds") - print("\nKey takeaways:") - print("1. Async/await provides better performance for I/O operations") - print("2. Movements allow for interactive browser automation") - print("3. Each step is executed sequentially") - print("4. Timing is crucial for successful interactions") - print("5. Error handling is important for robust automation") - print("\nNext steps:") - print("- Customize the steps for your specific use case") - print("- Add more complex interactions") - print("- Implement retry logic for failed steps") - print("- Use structured output schemas for better data extraction") - - except Exception as e: - print(f"💥 Error occurred: {str(e)}") - print("\n🛠️ Troubleshooting:") - print("1. Make sure your .env file contains TEST_API_KEY") - print("2. Ensure the API server is running on localhost:8001") - print("3. Check your internet connection") - print("4. 
Verify the target website is accessible") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_pagination_example.py b/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_pagination_example.py deleted file mode 100644 index d4177f5..0000000 --- a/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_pagination_example.py +++ /dev/null @@ -1,315 +0,0 @@ -#!/usr/bin/env python3 -""" -Async Step-by-Step Pagination Example - -This example demonstrates the pagination process step by step using async/await patterns, -showing each stage of setting up and executing a paginated SmartScraper request. -""" - -import asyncio -import json -import logging -import os -import time - -import httpx -from dotenv import load_dotenv - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - handlers=[logging.StreamHandler()], -) -logger = logging.getLogger(__name__) - -# Load environment variables from .env file -load_dotenv() - - -async def step_1_environment_setup(): - """Step 1: Set up environment and API key""" - print("STEP 1: Environment Setup") - print("=" * 40) - - # Check if API key is available - api_key = os.getenv("TEST_API_KEY") - if not api_key: - print("❌ Error: TEST_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export TEST_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: TEST_API_KEY=your-api-key-here") - return None - - print("✅ API key found in environment") - print(f"🔑 API Key: {api_key[:8]}...{api_key[-4:]}") - return api_key - - -async def step_2_server_connectivity_check(api_key): - """Step 2: Check server connectivity""" - print("\nSTEP 2: Server Connectivity Check") - print("=" * 40) - - url = "http://localhost:8001/v1/smartscraper" - - try: - async with httpx.AsyncClient(timeout=5.0) as client: - # Try to access the health endpoint - health_url = url.replace("/v1/smartscraper", "/healthz") - response = await client.get(health_url) - - if response.status_code == 200: - print("✅ Server is accessible") - print(f"🔗 Health endpoint: {health_url}") - return True - else: - print( - f"❌ Server health check failed with status {response.status_code}" - ) - return False - except Exception as e: - print(f"❌ Server connectivity check failed: {e}") - print("Please ensure the server is running:") - print(" poetry run uvicorn app.main:app --host 0.0.0.0 --port 8001 --reload") - return False - - -def step_3_define_request_parameters(): - """Step 3: Define the request parameters""" - print("\nSTEP 3: Define Request Parameters") - print("=" * 40) - - # Configuration parameters - website_url = "https://www.amazon.in/s?k=tv&crid=1TEF1ZFVLU8R8&sprefix=t%2Caps%2C390&ref=nb_sb_noss_2" - user_prompt = "Extract all product info including name, price, rating, image_url, and description" - total_pages = 3 - - print("🌐 Website URL:") - print(f" {website_url}") - print("\n📝 User Prompt:") - print(f" {user_prompt}") - print(f"\n📄 Total Pages: {total_pages}") - print(f"📊 Expected Products: ~{total_pages * 20} (estimated)") - - return { - "website_url": website_url, - "user_prompt": user_prompt, - "total_pages": total_pages, - } - - -def step_4_prepare_headers(api_key): - """Step 4: Prepare request headers""" - print("\nSTEP 4: Prepare Request Headers") - print("=" * 40) - - headers = { - "sec-ch-ua-platform": '"macOS"', - 
"SGAI-APIKEY": api_key, - "Referer": "https://dashboard.scrapegraphai.com/", - "sec-ch-ua": '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"', - "sec-ch-ua-mobile": "?0", - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36", - "Accept": "application/json", - "Content-Type": "application/json", - } - - print("📋 Headers configured:") - for key, value in headers.items(): - if key == "SGAI-APIKEY": - print(f" {key}: {value[:10]}...{value[-10:]}") # Mask API key - else: - print(f" {key}: {value}") - - return headers - - -async def step_5_execute_pagination_request(headers, config): - """Step 5: Execute the pagination request""" - print("\nSTEP 5: Execute Pagination Request") - print("=" * 40) - - url = "http://localhost:8001/v1/smartscraper" - - # Request payload with pagination - payload = { - "website_url": config["website_url"], - "user_prompt": config["user_prompt"], - "output_schema": {}, - "total_pages": config["total_pages"], - } - - print("🚀 Starting pagination request...") - print("⏱️ This may take several minutes for multiple pages...") - - try: - # Start timing - start_time = time.time() - - # Use longer timeout for pagination requests as they may take more time - async with httpx.AsyncClient(timeout=600.0) as client: - response = await client.post(url, headers=headers, json=payload) - - # Calculate duration - duration = time.time() - start_time - - print(f"✅ Request completed in {duration:.2f} seconds") - print(f"📊 Response Status: {response.status_code}") - - if response.status_code == 200: - result = response.json() - return result, duration - else: - print(f"❌ Request failed with status {response.status_code}") - print(f"Response: {response.text}") - return None, duration - - except httpx.TimeoutException: - duration = time.time() - start_time - print(f"❌ Request timed out after {duration:.2f} seconds (>600s timeout)") - print( - "This may indicate the 
server is taking too long to process the pagination request." - ) - return None, duration - - except httpx.RequestError as e: - duration = time.time() - start_time - print(f"❌ Request error after {duration:.2f} seconds: {e}") - print("Common causes:") - print(" - Server is not running") - print(" - Wrong port (check server logs)") - print(" - Network connectivity issues") - return None, duration - - except Exception as e: - duration = time.time() - start_time - print(f"❌ Unexpected error after {duration:.2f} seconds: {e}") - return None, duration - - -def step_6_process_results(result, duration): - """Step 6: Process and display the results""" - print("\nSTEP 6: Process Results") - print("=" * 40) - - if result is None: - print("❌ No results to process") - return - - print("📋 Processing pagination results...") - - # Display results based on type - if isinstance(result, dict): - print("\n🔍 Response Structure:") - print(json.dumps(result, indent=2, ensure_ascii=False)) - - # Check for pagination success indicators - if "data" in result: - print("\n✨ Pagination successful! Data extracted from multiple pages") - - elif isinstance(result, list): - print(f"\n✅ Pagination successful! Extracted {len(result)} items") - - # Show first few items - print("\n📦 Sample Results:") - for i, item in enumerate(result[:3]): # Show first 3 items - print(f" {i+1}. {item}") - - if len(result) > 3: - print(f" ... 
and {len(result) - 3} more items") - - else: - print(f"\n📋 Result: {result}") - - print(f"\n⏱️ Total processing time: {duration:.2f} seconds") - - -def step_7_show_curl_equivalent(api_key, config): - """Step 7: Show equivalent curl command""" - print("\nSTEP 7: Equivalent curl Command") - print("=" * 40) - - curl_command = f""" -curl --location 'http://localhost:8001/v1/smartscraper' \\ ---header 'sec-ch-ua-platform: "macOS"' \\ ---header 'SGAI-APIKEY: {api_key}' \\ ---header 'Referer: https://dashboard.scrapegraphai.com/' \\ ---header 'sec-ch-ua: "Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"' \\ ---header 'sec-ch-ua-mobile: ?0' \\ ---header 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36' \\ ---header 'Accept: application/json' \\ ---header 'Content-Type: application/json' \\ ---data '{{ - "website_url": "{config['website_url']}", - "user_prompt": "{config['user_prompt']}", - "output_schema": {{}}, - "total_pages": {config['total_pages']} -}}' - """ - - print("Equivalent curl command:") - print(curl_command) - - -async def main(): - """Main function to run the async step-by-step pagination example""" - total_start_time = time.time() - logger.info("Starting Async Step-by-Step Pagination Example") - - print("ScrapeGraph SDK - Async Step-by-Step Pagination Example") - print("=" * 60) - print("This example shows the complete async process of setting up and") - print("executing a pagination request with SmartScraper API") - print("=" * 60) - - # Step 1: Environment setup - api_key = await step_1_environment_setup() - if not api_key: - return - - # Step 2: Server connectivity check - server_ok = await step_2_server_connectivity_check(api_key) - if not server_ok: - return - - # Step 3: Define request parameters - config = step_3_define_request_parameters() - - # Step 4: Prepare headers - headers = step_4_prepare_headers(api_key) - - # Step 5: Execute request - result, 
duration = await step_5_execute_pagination_request(headers, config) - - # Step 6: Process results - step_6_process_results(result, duration) - - # Step 7: Show curl equivalent - step_7_show_curl_equivalent(api_key, config) - - total_duration = time.time() - total_start_time - logger.info( - f"Example completed! Total execution time: {total_duration:.2f} seconds" - ) - - print("\n" + "=" * 60) - print("Async step-by-step pagination example completed!") - print(f"⏱️ Total execution time: {total_duration:.2f} seconds") - print("\nKey takeaways:") - print("1. Async/await provides better performance for I/O operations") - print("2. Always validate your API key and server connectivity first") - print("3. Define clear request parameters for structured data") - print("4. Configure pagination parameters carefully") - print("5. Handle errors gracefully with proper timeouts") - print("6. Use equivalent curl commands for testing") - print("\nNext steps:") - print("- Try different websites and prompts") - print("- Experiment with different page counts") - print("- Add error handling for production use") - print("- Consider rate limiting for large requests") - print("- Implement retry logic for failed requests") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_scrape_example.py b/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_scrape_example.py deleted file mode 100644 index 41894b9..0000000 --- a/scrapegraph-py/examples/advanced_features/steps/async_step_by_step_scrape_example.py +++ /dev/null @@ -1,184 +0,0 @@ -""" -Async step-by-step example demonstrating how to use the Scrape API with the scrapegraph-py async SDK. - -This example shows the basic async workflow: -1. Initialize the async client -2. Make a scrape request asynchronously -3. Handle the response -4. Save the HTML content -5. 
Basic analysis - -Requirements: -- Python 3.7+ -- scrapegraph-py -- python-dotenv -- aiohttp -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import asyncio -import os -from pathlib import Path -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient - -# Load environment variables from .env file -load_dotenv() - - -async def step_1_initialize_async_client(): - """Step 1: Initialize the scrapegraph-py async client.""" - print("🔑 Step 1: Initializing async client...") - - try: - # Initialize async client using environment variable - client = AsyncClient.from_env() - print("✅ Async client initialized successfully") - return client - except Exception as e: - print(f"❌ Failed to initialize async client: {str(e)}") - print("Make sure you have SGAI_API_KEY in your .env file") - raise - - -async def step_2_make_async_scrape_request(client, url, render_js=False): - """Step 2: Make a scrape request asynchronously.""" - print(f"\n🌐 Step 2: Making async scrape request to {url}") - print(f"🔧 Render heavy JS: {render_js}") - - try: - # Make the scrape request asynchronously - result = await client.scrape( - website_url=url, - render_heavy_js=render_js - ) - print("✅ Async scrape request completed successfully") - return result - except Exception as e: - print(f"❌ Async scrape request failed: {str(e)}") - raise - - -def step_3_handle_response(result): - """Step 3: Handle and analyze the response.""" - print(f"\n📊 Step 3: Analyzing response...") - - # Check if we got HTML content - html_content = result.get("html", "") - if not html_content: - print("❌ No HTML content received") - return None - - # Basic response analysis - print(f"✅ Received HTML content") - print(f"📏 Content length: {len(html_content):,} characters") - print(f"📄 Lines: {len(html_content.splitlines()):,}") - - # Check for common HTML elements - has_doctype = html_content.strip().startswith(" 0: - print(f" {element}: {count}") - - # Check for 
JavaScript and CSS - has_js = elements["script"] > 0 - has_css = elements["style"] > 0 - - print(f"\n🎨 Content types:") - print(f" JavaScript: {'Yes' if has_js else 'No'}") - print(f" CSS: {'Yes' if has_css else 'No'}") - - return elements - - -async def main(): - """Main function demonstrating async step-by-step scrape usage.""" - print("🚀 Async Step-by-Step Scrape API Example") - print("=" * 55) - - # Test URL - test_url = "https://example.com" - - try: - # Step 1: Initialize async client - async with AsyncClient.from_env() as client: - print("✅ Async client initialized successfully") - - # Step 2: Make async scrape request - result = await step_2_make_async_scrape_request(client, test_url, render_js=False) - - # Step 3: Handle response - html_content = step_3_handle_response(result) - if not html_content: - print("❌ Cannot proceed without HTML content") - return - - # Step 4: Save content - filename = "async_example_website" - saved_file = step_4_save_html_content(html_content, filename) - - # Step 5: Basic analysis - elements = step_5_basic_analysis(html_content) - - # Summary - print(f"\n🎯 Summary:") - print(f"✅ Successfully processed {test_url} asynchronously") - print(f"💾 HTML saved to: {saved_file}") - print(f"📊 Analyzed {len(html_content):,} characters of HTML content") - - print("✅ Async client closed successfully") - - except Exception as e: - print(f"\n💥 Error occurred: {str(e)}") - print("Check your API key and internet connection") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/advanced_features/steps/step_by_step_agenticscraper_example.py b/scrapegraph-py/examples/advanced_features/steps/step_by_step_agenticscraper_example.py deleted file mode 100644 index d82ad2b..0000000 --- a/scrapegraph-py/examples/advanced_features/steps/step_by_step_agenticscraper_example.py +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env python3 -""" -Step-by-Step AgenticScraper Example - -This example demonstrates how to use the 
AgenticScraper API for automated browser interactions. -It shows how to make actual HTTP requests with step-by-step browser actions. -""" - -import json -import os -import time - -import requests -from dotenv import load_dotenv - -# Load environment variables from .env file -load_dotenv() - - -def agentic_scraper_request(): - """Example of making a request to the agentic scraper API""" - - # Get API key from .env file - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - raise ValueError( - "API key must be provided or set in .env file as SGAI_API_KEY. " - "Create a .env file with: SGAI_API_KEY=your_api_key_here" - ) - - steps = [ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login" - ] - website_url = "https://dashboard.scrapegraphai.com/" - - headers = { - "SGAI-APIKEY": api_key, - "Content-Type": "application/json", - } - - body = { - "url": website_url, - "use_session": True, - "steps": steps, - } - - print("🤖 Starting Agentic Scraper with Automated Actions...") - print(f"🌐 Website URL: {website_url}") - print(f"🔧 Use Session: True") - print(f"📋 Steps: {len(steps)} automated actions") - print("\n" + "=" * 60) - - # Start timer - start_time = time.time() - print( - f"⏱️ Timer started at: {time.strftime('%H:%M:%S', time.localtime(start_time))}" - ) - print("🔄 Processing request...") - - try: - response = requests.post( - "http://localhost:8001/v1/agentic-scrapper", - json=body, - headers=headers, - ) - - # Calculate execution time - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Total execution time: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print( - f"📊 Performance: {execution_time:.1f}s ({execution_minutes:.1f}m) for {len(steps)} steps" - ) - - if response.status_code == 200: - result = response.json() - 
print("✅ Request completed successfully!") - print(f"📊 Request ID: {result.get('request_id', 'N/A')}") - print(f"🔄 Status: {result.get('status', 'N/A')}") - - if result.get("error"): - print(f"❌ Error: {result['error']}") - else: - print("\n📋 EXTRACTED DATA:") - print("=" * 60) - - # Pretty print the result with proper indentation - if "result" in result: - print(json.dumps(result["result"], indent=2, ensure_ascii=False)) - else: - print("No result data found") - - else: - print(f"❌ Request failed with status code: {response.status_code}") - print(f"Response: {response.text}") - - except requests.exceptions.RequestException as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"🌐 Network error: {str(e)}") - except Exception as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"💥 Unexpected error: {str(e)}") - - -def show_curl_equivalent(): - """Show the equivalent curl command for reference""" - - # Load environment variables from .env file - load_dotenv() - - api_key = os.getenv("SGAI_API_KEY", "your-api-key-here") - curl_command = f""" -curl --location 'http://localhost:8001/v1/agentic-scrapper' \\ ---header 'SGAI-APIKEY: {api_key}' \\ ---header 'Content-Type: application/json' \\ ---data-raw '{{ - "url": "https://dashboard.scrapegraphai.com/", - "use_session": true, - "steps": [ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login" - ] -}}' - """ - - 
print("Equivalent curl command:") - print(curl_command) - - -def main(): - """Main function to run the agentic scraper example""" - try: - print("🤖 AGENTIC SCRAPER EXAMPLE") - print("=" * 60) - print("This example demonstrates automated browser interactions") - print() - - # Show the curl equivalent - show_curl_equivalent() - - print("\n" + "=" * 60) - - # Make the actual API request - agentic_scraper_request() - - print("\n" + "=" * 60) - print("Example completed!") - print("\nKey takeaways:") - print("1. Agentic scraper enables automated browser actions") - print("2. Each step is executed sequentially") - print("3. Session management allows for complex workflows") - print("4. Perfect for login flows and form interactions") - print("\nNext steps:") - print("- Customize the steps for your specific use case") - print("- Add more complex automation sequences") - print("- Implement error handling for failed actions") - print("- Use session management for multi-step workflows") - - except Exception as e: - print(f"💥 Error occurred: {str(e)}") - print("\n🛠️ Troubleshooting:") - print("1. Make sure your .env file contains SGAI_API_KEY") - print("2. Ensure the API server is running on localhost:8001") - print("3. Check your internet connection") - print("4. Verify the target website is accessible") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/advanced_features/steps/step_by_step_cookies_example.py b/scrapegraph-py/examples/advanced_features/steps/step_by_step_cookies_example.py deleted file mode 100644 index 4ebc734..0000000 --- a/scrapegraph-py/examples/advanced_features/steps/step_by_step_cookies_example.py +++ /dev/null @@ -1,377 +0,0 @@ -#!/usr/bin/env python3 -""" -Step-by-Step Cookies Example - -This example demonstrates the cookies integration process step by step, showing each stage -of setting up and executing a SmartScraper request with cookies for authentication. 
-""" - -import json -import os -import time -from typing import Dict, Optional - -from dotenv import load_dotenv -from pydantic import BaseModel, Field - -from scrapegraph_py import Client -from scrapegraph_py.exceptions import APIError - -# Load environment variables from .env file -load_dotenv() - - -class CookieInfo(BaseModel): - """Model representing cookie information.""" - - cookies: Dict[str, str] = Field(description="Dictionary of cookie key-value pairs") - - -class UserProfile(BaseModel): - """Model representing user profile information.""" - - username: str = Field(description="User's username") - email: Optional[str] = Field(description="User's email address") - preferences: Optional[Dict[str, str]] = Field(description="User preferences") - - -def step_1_environment_setup(): - """Step 1: Set up environment and API key""" - print("STEP 1: Environment Setup") - print("=" * 40) - - # Check if API key is available - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: SGAI_API_KEY=your-api-key-here") - return None - - print("✅ API key found in environment") - print(f"🔑 API Key: {api_key[:8]}...{api_key[-4:]}") - return api_key - - -def step_2_client_initialization(api_key): - """Step 2: Initialize the ScrapeGraph client""" - print("\nSTEP 2: Client Initialization") - print("=" * 40) - - try: - client = Client(api_key=api_key) - print("✅ Client initialized successfully") - print(f"🔧 Client type: {type(client)}") - return client - except Exception as e: - print(f"❌ Error initializing client: {e}") - return None - - -def step_3_define_schema(): - """Step 3: Define the output schema for structured data""" - print("\nSTEP 3: Define Output Schema") - print("=" * 40) - - print("📋 Defining Pydantic models for structured output:") - print(" - CookieInfo: Cookie information structure") - print(" - UserProfile: User profile data (for authenticated requests)") - - # Show the schema structure - schema_example = CookieInfo.model_json_schema() - print(f"✅ Schema defined with {len(schema_example['properties'])} properties") - - return CookieInfo - - -def step_4_prepare_cookies(): - """Step 4: Prepare cookies for authentication""" - print("\nSTEP 4: Prepare Cookies") - print("=" * 40) - - # Example cookies for different scenarios - print("🍪 Preparing cookies for authentication...") - - # Basic test cookies - basic_cookies = {"cookies_key": "cookies_value", "test_cookie": "test_value"} - - # Session cookies - session_cookies = {"session_id": "abc123def456", "user_token": "xyz789ghi012"} - - # Authentication cookies - auth_cookies = { - "auth_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...", - "user_id": "user123", - "csrf_token": "csrf_abc123", - } - - print("📋 Available cookie sets:") - print(f" 1. Basic cookies: {len(basic_cookies)} items") - print(f" 2. Session cookies: {len(session_cookies)} items") - print(f" 3. 
Auth cookies: {len(auth_cookies)} items") - - # Use basic cookies for this example - selected_cookies = basic_cookies - print(f"\n✅ Using basic cookies: {selected_cookies}") - - return selected_cookies - - -def step_5_format_cookies_for_headers(cookies): - """Step 5: Format cookies for HTTP headers""" - print("\nSTEP 5: Format Cookies for Headers") - print("=" * 40) - - print("🔧 Converting cookies dictionary to HTTP Cookie header...") - - # Convert cookies dict to Cookie header string - cookie_header = "; ".join([f"{k}={v}" for k, v in cookies.items()]) - - # Create headers dictionary - headers = {"Cookie": cookie_header} - - print("📋 Cookie formatting:") - print(f" Original cookies: {cookies}") - print(f" Cookie header: {cookie_header}") - print(f" Headers dict: {headers}") - - return headers - - -def step_6_configure_request(): - """Step 6: Configure the request parameters""" - print("\nSTEP 6: Configure Request Parameters") - print("=" * 40) - - # Configuration parameters - website_url = "https://httpbin.org/cookies" - user_prompt = "Extract all cookies info" - - print("🌐 Website URL:") - print(f" {website_url}") - print("\n📝 User Prompt:") - print(f" {user_prompt}") - print("\n🔧 Additional Features:") - print(" - Cookies authentication") - print(" - Structured output schema") - - return {"website_url": website_url, "user_prompt": user_prompt} - - -def step_7_execute_request(client, config, headers, output_schema): - """Step 7: Execute the request with cookies""" - print("\nSTEP 7: Execute Request with Cookies") - print("=" * 40) - - print("🚀 Starting request with cookies...") - print("🍪 Cookies will be sent in HTTP headers") - - try: - # Start timing - start_time = time.time() - - # Perform the scraping with cookies - result = client.smartscraper( - website_url=config["website_url"], - user_prompt=config["user_prompt"], - headers=headers, - output_schema=output_schema, - ) - - # Calculate duration - duration = time.time() - start_time - - print(f"✅ Request 
completed in {duration:.2f} seconds") - print(f"📊 Response type: {type(result)}") - - return result, duration - - except APIError as e: - print(f"❌ API Error: {e}") - print("This could be due to:") - print(" - Invalid API key") - print(" - Rate limiting") - print(" - Server issues") - return None, 0 - - except Exception as e: - print(f"❌ Unexpected error: {e}") - print("This could be due to:") - print(" - Network connectivity issues") - print(" - Invalid website URL") - print(" - Cookie format issues") - return None, 0 - - -def step_8_process_results(result, duration, cookies): - """Step 8: Process and display the results""" - print("\nSTEP 8: Process Results") - print("=" * 40) - - if result is None: - print("❌ No results to process") - return - - print("📋 Processing cookies response...") - - # Display results - if isinstance(result, dict): - print("\n🔍 Response Structure:") - print(json.dumps(result, indent=2, ensure_ascii=False)) - - # Check if cookies were received correctly - if "cookies" in result: - received_cookies = result["cookies"] - print(f"\n🍪 Cookies sent: {cookies}") - print(f"🍪 Cookies received: {received_cookies}") - - # Verify cookies match - if received_cookies == cookies: - print("✅ Cookies match perfectly!") - else: - print("⚠️ Cookies don't match exactly (this might be normal)") - - elif isinstance(result, list): - print(f"\n✅ Request successful! Extracted {len(result)} items") - print("\n📦 Results:") - for i, item in enumerate(result[:3]): # Show first 3 items - print(f" {i+1}. {item}") - - if len(result) > 3: - print(f" ... 
and {len(result) - 3} more items") - - else: - print(f"\n📋 Result: {result}") - - print(f"\n⏱️ Total processing time: {duration:.2f} seconds") - - -def step_9_test_different_scenarios(client, output_schema): - """Step 9: Test different cookie scenarios""" - print("\nSTEP 9: Test Different Cookie Scenarios") - print("=" * 40) - - scenarios = [ - { - "name": "Session Cookies", - "cookies": {"session_id": "abc123", "user_token": "xyz789"}, - "description": "Basic session management", - }, - { - "name": "Authentication Cookies", - "cookies": {"auth_token": "secret123", "preferences": "dark_mode"}, - "description": "User authentication and preferences", - }, - { - "name": "Complex Cookies", - "cookies": { - "session_id": "abc123def456", - "user_id": "user789", - "cart_id": "cart101112", - "preferences": "dark_mode,usd", - }, - "description": "E-commerce scenario with multiple cookies", - }, - ] - - for i, scenario in enumerate(scenarios, 1): - print(f"\n🧪 Testing Scenario {i}: {scenario['name']}") - print(f" Description: {scenario['description']}") - print(f" Cookies: {scenario['cookies']}") - - # Format cookies for headers - cookie_header = "; ".join([f"{k}={v}" for k, v in scenario["cookies"].items()]) - headers = {"Cookie": cookie_header} - - try: - # Quick test request - result = client.smartscraper( - website_url="https://httpbin.org/cookies", - user_prompt=f"Extract cookies for {scenario['name']}", - headers=headers, - output_schema=output_schema, - ) - print(f" ✅ Success: {type(result)}") - except Exception as e: - print(f" ❌ Error: {str(e)[:50]}...") - - -def step_10_cleanup(client): - """Step 10: Clean up resources""" - print("\nSTEP 10: Cleanup") - print("=" * 40) - - try: - client.close() - print("✅ Client session closed successfully") - print("🔒 Resources freed") - except Exception as e: - print(f"⚠️ Warning during cleanup: {e}") - - -def main(): - """Main function to run the step-by-step cookies example""" - - print("ScrapeGraph SDK - Step-by-Step Cookies 
Example") - print("=" * 60) - print("This example shows the complete process of setting up and") - print("executing a SmartScraper request with cookies for authentication") - print("=" * 60) - - # Step 1: Environment setup - api_key = step_1_environment_setup() - if not api_key: - return - - # Step 2: Client initialization - client = step_2_client_initialization(api_key) - if not client: - return - - # Step 3: Define schema - output_schema = step_3_define_schema() - - # Step 4: Prepare cookies - cookies = step_4_prepare_cookies() - - # Step 5: Format cookies for headers - headers = step_5_format_cookies_for_headers(cookies) - - # Step 6: Configure request - config = step_6_configure_request() - - # Step 7: Execute request - result, duration = step_7_execute_request(client, config, headers, output_schema) - - # Step 8: Process results - step_8_process_results(result, duration, cookies) - - # Step 9: Test different scenarios - step_9_test_different_scenarios(client, output_schema) - - # Step 10: Cleanup - step_10_cleanup(client) - - print("\n" + "=" * 60) - print("Step-by-step cookies example completed!") - print("\nKey takeaways:") - print("1. Cookies are passed via HTTP headers") - print("2. Cookie format: 'key1=value1; key2=value2'") - print("3. Always validate your API key first") - print("4. Test different cookie scenarios") - print("5. 
Handle errors gracefully") - print("\nCommon use cases:") - print("- Authentication for protected pages") - print("- Session management for dynamic content") - print("- User preferences and settings") - print("- Shopping cart and user state") - print("\nNext steps:") - print("- Try with real websites that require authentication") - print("- Experiment with different cookie combinations") - print("- Add error handling for production use") - print("- Consider security implications of storing cookies") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/advanced_features/steps/step_by_step_movements_example.py b/scrapegraph-py/examples/advanced_features/steps/step_by_step_movements_example.py deleted file mode 100644 index b7e40c2..0000000 --- a/scrapegraph-py/examples/advanced_features/steps/step_by_step_movements_example.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/env python3 -""" -Step-by-Step SmartScraper Movements Example - -This example demonstrates how to use interactive movements with SmartScraper API. -It shows how to make actual HTTP requests with step-by-step browser interactions. -""" - -import json -import os -import time - -import requests -from dotenv import load_dotenv - -# Load environment variables from .env file -load_dotenv() - - -def smart_scraper_movements(): - """Example of making a movements request to the smartscraper API""" - - # Get API key from .env file - api_key = os.getenv("TEST_API_KEY") - if not api_key: - raise ValueError( - "API key must be provided or set in .env file as TEST_API_KEY. 
" - "Create a .env file with: TEST_API_KEY=your_api_key_here" - ) - - steps = [ - "click on search bar", - "wait for 500ms", - "fill email input box with mdehsan873@gmail.com", - "wait a sec", - "click on the first time of search result", - "wait for 2 seconds to load the result of search", - ] - website_url = "https://github.com/" - user_prompt = "Extract user profile" - - headers = { - "SGAI-APIKEY": api_key, - "Content-Type": "application/json", - } - - body = { - "website_url": website_url, - "user_prompt": user_prompt, - "output_schema": {}, - "steps": steps, - } - - print("🚀 Starting Smart Scraper with Interactive Movements...") - print(f"🌐 Website URL: {website_url}") - print(f"🎯 User Prompt: {user_prompt}") - print(f"📋 Steps: {len(steps)} interactive steps") - print("\n" + "=" * 60) - - # Start timer - start_time = time.time() - print( - f"⏱️ Timer started at: {time.strftime('%H:%M:%S', time.localtime(start_time))}" - ) - print("🔄 Processing request...") - - try: - response = requests.post( - "http://localhost:8001/v1/smartscraper", - json=body, - headers=headers, - ) - - # Calculate execution time - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Total execution time: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print( - f"📊 Performance: {execution_time:.1f}s ({execution_minutes:.1f}m) for {len(steps)} steps" - ) - - if response.status_code == 200: - result = response.json() - print("✅ Request completed successfully!") - print(f"📊 Request ID: {result.get('request_id', 'N/A')}") - print(f"🔄 Status: {result.get('status', 'N/A')}") - - if result.get("error"): - print(f"❌ Error: {result['error']}") - else: - print("\n📋 EXTRACTED DATA:") - print("=" * 60) - - # Pretty print the result with proper indentation - if "result" in result: - print(json.dumps(result["result"], 
indent=2, ensure_ascii=False)) - else: - print("No result data found") - - else: - print(f"❌ Request failed with status code: {response.status_code}") - print(f"Response: {response.text}") - - except requests.exceptions.RequestException as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"🌐 Network error: {str(e)}") - except Exception as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"💥 Unexpected error: {str(e)}") - - -def show_curl_equivalent(): - """Show the equivalent curl command for reference""" - - # Load environment variables from .env file - load_dotenv() - - api_key = os.getenv("TEST_API_KEY", "your-api-key-here") - curl_command = f""" -curl --location 'http://localhost:8001/v1/smartscraper' \\ ---header 'SGAI-APIKEY: {api_key}' \\ ---header 'Content-Type: application/json' \\ ---data '{{ - "website_url": "https://github.com/", - "user_prompt": "Extract user profile", - "output_schema": {{}}, - "steps": [ - "click on search bar", - "wait for 500ms", - "fill email input box with mdehsan873@gmail.com", - "wait a sec", - "click on the first time of search result", - "wait for 2 seconds to load the result of search" - ] -}}' - """ - - print("Equivalent curl command:") - print(curl_command) - - -def main(): - """Main function to run the movements example""" - try: - print("🎯 SMART SCRAPER MOVEMENTS EXAMPLE") - print("=" * 60) - print("This example demonstrates interactive movements with timing") - print() - - # Show 
the curl equivalent - show_curl_equivalent() - - print("\n" + "=" * 60) - - # Make the actual API request - smart_scraper_movements() - - print("\n" + "=" * 60) - print("Example completed!") - print("\nKey takeaways:") - print("1. Movements allow for interactive browser automation") - print("2. Each step is executed sequentially") - print("3. Timing is crucial for successful interactions") - print("4. Error handling is important for robust automation") - print("\nNext steps:") - print("- Customize the steps for your specific use case") - print("- Add more complex interactions") - print("- Implement retry logic for failed steps") - print("- Use structured output schemas for better data extraction") - - except Exception as e: - print(f"💥 Error occurred: {str(e)}") - print("\n🛠️ Troubleshooting:") - print("1. Make sure your .env file contains TEST_API_KEY") - print("2. Ensure the API server is running on localhost:8001") - print("3. Check your internet connection") - print("4. Verify the target website is accessible") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/advanced_features/steps/step_by_step_pagination_example.py b/scrapegraph-py/examples/advanced_features/steps/step_by_step_pagination_example.py deleted file mode 100644 index fb308df..0000000 --- a/scrapegraph-py/examples/advanced_features/steps/step_by_step_pagination_example.py +++ /dev/null @@ -1,259 +0,0 @@ -#!/usr/bin/env python3 -""" -Step-by-Step Pagination Example - -This example demonstrates the pagination process step by step, showing each stage -of setting up and executing a paginated SmartScraper request. 
-""" - -import json -import os -import time -from typing import List, Optional - -from dotenv import load_dotenv -from pydantic import BaseModel, Field - -from scrapegraph_py import Client -from scrapegraph_py.exceptions import APIError - -# Load environment variables from .env file -load_dotenv() - - -class ProductInfo(BaseModel): - """Schema for product information""" - - name: str = Field(description="Product name") - price: Optional[str] = Field(description="Product price") - rating: Optional[str] = Field(description="Product rating") - image_url: Optional[str] = Field(description="Product image URL") - description: Optional[str] = Field(description="Product description") - - -class ProductList(BaseModel): - """Schema for list of products""" - - products: List[ProductInfo] = Field(description="List of products") - - -def step_1_environment_setup(): - """Step 1: Set up environment and API key""" - print("STEP 1: Environment Setup") - print("=" * 40) - - # Check if API key is available - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: SGAI_API_KEY=your-api-key-here") - return None - - print("✅ API key found in environment") - print(f"🔑 API Key: {api_key[:8]}...{api_key[-4:]}") - return api_key - - -def step_2_client_initialization(api_key): - """Step 2: Initialize the ScrapeGraph client""" - print("\nSTEP 2: Client Initialization") - print("=" * 40) - - try: - client = Client(api_key=api_key) - print("✅ Client initialized successfully") - print(f"🔧 Client type: {type(client)}") - return client - except Exception as e: - print(f"❌ Error initializing client: {e}") - return None - - -def step_3_define_schema(): - """Step 3: Define the output schema for structured data""" - print("\nSTEP 3: Define Output Schema") - print("=" * 40) - - print("📋 Defining Pydantic models for structured output:") - print(" - ProductInfo: Individual product data") - print(" - ProductList: Collection of products") - - # Show the schema structure - schema_example = ProductList.model_json_schema() - print(f"✅ Schema defined with {len(schema_example['properties'])} properties") - - return ProductList - - -def step_4_configure_request(): - """Step 4: Configure the pagination request parameters""" - print("\nSTEP 4: Configure Request Parameters") - print("=" * 40) - - # Configuration parameters - website_url = "https://www.amazon.in/s?k=tv&crid=1TEF1ZFVLU8R8&sprefix=t%2Caps%2C390&ref=nb_sb_noss_2" - user_prompt = "Extract all product info including name, price, rating, image_url, and description" - total_pages = 3 - - print("🌐 Website URL:") - print(f" {website_url}") - print("\n📝 User Prompt:") - print(f" {user_prompt}") - print(f"\n📄 Total Pages: {total_pages}") - print(f"📊 Expected Products: ~{total_pages * 20} (estimated)") - - return { - "website_url": website_url, - "user_prompt": user_prompt, - "total_pages": total_pages, - } - - -def step_5_execute_request(client, config, output_schema): - """Step 5: Execute the pagination request""" - print("\nSTEP 5: Execute Pagination Request") - print("=" * 
40) - - print("🚀 Starting pagination request...") - print("⏱️ This may take several minutes for multiple pages...") - - try: - # Start timing - start_time = time.time() - - # Make the request with pagination - result = client.smartscraper( - user_prompt=config["user_prompt"], - website_url=config["website_url"], - output_schema=output_schema, - total_pages=config["total_pages"], - ) - - # Calculate duration - duration = time.time() - start_time - - print(f"✅ Request completed in {duration:.2f} seconds") - print(f"📊 Response type: {type(result)}") - - return result, duration - - except APIError as e: - print(f"❌ API Error: {e}") - print("This could be due to:") - print(" - Invalid API key") - print(" - Rate limiting") - print(" - Server issues") - return None, 0 - - except Exception as e: - print(f"❌ Unexpected error: {e}") - print("This could be due to:") - print(" - Network connectivity issues") - print(" - Invalid website URL") - print(" - Pagination limitations") - return None, 0 - - -def step_6_process_results(result, duration): - """Step 6: Process and display the results""" - print("\nSTEP 6: Process Results") - print("=" * 40) - - if result is None: - print("❌ No results to process") - return - - print("📋 Processing pagination results...") - - # Display results based on type - if isinstance(result, dict): - print("\n🔍 Response Structure:") - print(json.dumps(result, indent=2, ensure_ascii=False)) - - # Check for pagination success indicators - if "data" in result: - print("\n✨ Pagination successful! Data extracted from multiple pages") - - elif isinstance(result, list): - print(f"\n✅ Pagination successful! Extracted {len(result)} items") - - # Show first few items - print("\n📦 Sample Results:") - for i, item in enumerate(result[:3]): # Show first 3 items - print(f" {i+1}. {item}") - - if len(result) > 3: - print(f" ... 
and {len(result) - 3} more items") - - else: - print(f"\n📋 Result: {result}") - - print(f"\n⏱️ Total processing time: {duration:.2f} seconds") - - -def step_7_cleanup(client): - """Step 7: Clean up resources""" - print("\nSTEP 7: Cleanup") - print("=" * 40) - - try: - client.close() - print("✅ Client session closed successfully") - print("🔒 Resources freed") - except Exception as e: - print(f"⚠️ Warning during cleanup: {e}") - - -def main(): - """Main function to run the step-by-step pagination example""" - - print("ScrapeGraph SDK - Step-by-Step Pagination Example") - print("=" * 60) - print("This example shows the complete process of setting up and") - print("executing a pagination request with SmartScraper API") - print("=" * 60) - - # Step 1: Environment setup - api_key = step_1_environment_setup() - if not api_key: - return - - # Step 2: Client initialization - client = step_2_client_initialization(api_key) - if not client: - return - - # Step 3: Define schema - output_schema = step_3_define_schema() - - # Step 4: Configure request - config = step_4_configure_request() - - # Step 5: Execute request - result, duration = step_5_execute_request(client, config, output_schema) - - # Step 6: Process results - step_6_process_results(result, duration) - - # Step 7: Cleanup - step_7_cleanup(client) - - print("\n" + "=" * 60) - print("Step-by-step pagination example completed!") - print("\nKey takeaways:") - print("1. Always validate your API key first") - print("2. Define clear output schemas for structured data") - print("3. Configure pagination parameters carefully") - print("4. Handle errors gracefully") - print("5. 
Clean up resources after use") - print("\nNext steps:") - print("- Try different websites and prompts") - print("- Experiment with different page counts") - print("- Add error handling for production use") - print("- Consider rate limiting for large requests") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/advanced_features/steps/step_by_step_scrape_example.py b/scrapegraph-py/examples/advanced_features/steps/step_by_step_scrape_example.py deleted file mode 100644 index 8627dfe..0000000 --- a/scrapegraph-py/examples/advanced_features/steps/step_by_step_scrape_example.py +++ /dev/null @@ -1,183 +0,0 @@ -""" -Step-by-step example demonstrating how to use the Scrape API with the scrapegraph-py SDK. - -This example shows the basic workflow: -1. Initialize the client -2. Make a scrape request -3. Handle the response -4. Save the HTML content -5. Basic analysis - -Requirements: -- Python 3.7+ -- scrapegraph-py -- python-dotenv -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import os -from pathlib import Path -from dotenv import load_dotenv - -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -def step_1_initialize_client(): - """Step 1: Initialize the scrapegraph-py client.""" - print("🔑 Step 1: Initializing client...") - - try: - # Initialize client using environment variable - client = Client.from_env() - print("✅ Client initialized successfully") - return client - except Exception as e: - print(f"❌ Failed to initialize client: {str(e)}") - print("Make sure you have SGAI_API_KEY in your .env file") - raise - - -def step_2_make_scrape_request(client, url, render_js=False): - """Step 2: Make a scrape request.""" - print(f"\n🌐 Step 2: Making scrape request to {url}") - print(f"🔧 Render heavy JS: {render_js}") - - try: - # Make the scrape request - result = client.scrape( - website_url=url, - render_heavy_js=render_js - ) - print("✅ Scrape 
request completed successfully") - return result - except Exception as e: - print(f"❌ Scrape request failed: {str(e)}") - raise - - -def step_3_handle_response(result): - """Step 3: Handle and analyze the response.""" - print(f"\n📊 Step 3: Analyzing response...") - - # Check if we got HTML content - html_content = result.get("html", "") - if not html_content: - print("❌ No HTML content received") - return None - - # Basic response analysis - print(f"✅ Received HTML content") - print(f"📏 Content length: {len(html_content):,} characters") - print(f"📄 Lines: {len(html_content.splitlines()):,}") - - # Check for common HTML elements - has_doctype = html_content.strip().startswith(" 0: - print(f" {element}: {count}") - - # Check for JavaScript and CSS - has_js = elements["script"] > 0 - has_css = elements["style"] > 0 - - print(f"\n🎨 Content types:") - print(f" JavaScript: {'Yes' if has_js else 'No'}") - print(f" CSS: {'Yes' if has_css else 'No'}") - - return elements - - -def main(): - """Main function demonstrating step-by-step scrape usage.""" - print("🚀 Step-by-Step Scrape API Example") - print("=" * 50) - - # Test URL - test_url = "https://example.com" - - try: - # Step 1: Initialize client - client = step_1_initialize_client() - - # Step 2: Make scrape request - result = step_2_make_scrape_request(client, test_url, render_js=False) - - # Step 3: Handle response - html_content = step_3_handle_response(result) - if not html_content: - print("❌ Cannot proceed without HTML content") - return - - # Step 4: Save content - filename = "example_website" - saved_file = step_4_save_html_content(html_content, filename) - - # Step 5: Basic analysis - elements = step_5_basic_analysis(html_content) - - # Summary - print(f"\n🎯 Summary:") - print(f"✅ Successfully processed {test_url}") - print(f"💾 HTML saved to: {saved_file}") - print(f"📊 Analyzed {len(html_content):,} characters of HTML content") - - # Close client - client.close() - print("🔒 Client closed successfully") - - 
except Exception as e: - print(f"\n💥 Error occurred: {str(e)}") - print("Check your API key and internet connection") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/agenticscraper/async/async_agenticscraper_comprehensive_example.py b/scrapegraph-py/examples/agenticscraper/async/async_agenticscraper_comprehensive_example.py deleted file mode 100644 index c72da78..0000000 --- a/scrapegraph-py/examples/agenticscraper/async/async_agenticscraper_comprehensive_example.py +++ /dev/null @@ -1,458 +0,0 @@ -#!/usr/bin/env python3 -""" -Comprehensive Async Agentic Scraper Example - -This example demonstrates how to use the agentic scraper API endpoint -asynchronously to perform automated browser actions and scrape content -with both AI extraction and non-AI extraction modes. - -The agentic scraper can: -1. Navigate to a website -2. Perform a series of automated actions (like filling forms, clicking buttons) -3. Extract the resulting HTML content as markdown -4. Optionally use AI to extract structured data - -Usage: - python examples/async/async_agenticscraper_comprehensive_example.py -""" - -import asyncio -import json -import os -import time -from typing import Dict, List, Optional - -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -# Set logging level -sgai_logger.set_logging(level="INFO") - - -async def example_basic_scraping_no_ai(): - """Example: Basic agentic scraping without AI extraction.""" - - # Initialize the async client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: SGAI_API_KEY=your-api-key-here") - return None - - async with AsyncClient(api_key=api_key) as client: - # Define the steps to perform - steps = [ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login", - ] - - try: - print("🚀 Starting basic async agentic scraping (no AI extraction)...") - print(f"URL: https://dashboard.scrapegraphai.com/") - print(f"Steps: {steps}") - - # Perform the scraping without AI extraction - result = await client.agenticscraper( - url="https://dashboard.scrapegraphai.com/", - steps=steps, - use_session=True, - ai_extraction=False # No AI extraction - just get raw markdown - ) - - print("✅ Basic async scraping completed successfully!") - print(f"Request ID: {result.get('request_id')}") - - # Save the markdown content to a file - if result.get("markdown"): - with open("async_basic_scraped_content.md", "w", encoding="utf-8") as f: - f.write(result["markdown"]) - print("📄 Markdown content saved to 'async_basic_scraped_content.md'") - - # Print a preview of the content - if result.get("markdown"): - preview = ( - result["markdown"][:500] + "..." 
- if len(result["markdown"]) > 500 - else result["markdown"] - ) - print(f"\n📝 Content Preview:\n{preview}") - - if result.get("error"): - print(f"⚠️ Warning: {result['error']}") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - - -async def example_ai_extraction(): - """Example: Use AI extraction to get structured data from dashboard.""" - - # Initialize the async client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return None - - async with AsyncClient(api_key=api_key) as client: - # Define extraction schema for user dashboard information - output_schema = { - "user_info": { - "type": "object", - "properties": { - "username": {"type": "string"}, - "email": {"type": "string"}, - "dashboard_sections": { - "type": "array", - "items": {"type": "string"} - }, - "account_status": {"type": "string"}, - "credits_remaining": {"type": "number"} - }, - "required": ["username", "dashboard_sections"] - } - } - - steps = [ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login", - "wait for dashboard to load completely", - ] - - try: - print("🤖 Starting async agentic scraping with AI extraction...") - print(f"URL: https://dashboard.scrapegraphai.com/") - print(f"Steps: {steps}") - - result = await client.agenticscraper( - url="https://dashboard.scrapegraphai.com/", - steps=steps, - use_session=True, - user_prompt="Extract user information, available dashboard sections, account status, and remaining credits from the dashboard", - output_schema=output_schema, - ai_extraction=True - ) - - print("✅ Async AI extraction completed!") - print(f"Request ID: {result.get('request_id')}") - - if result.get("result"): - print("🎯 Extracted Structured Data:") - print(json.dumps(result["result"], indent=2)) - - # Save extracted data to JSON file - with 
open("async_extracted_dashboard_data.json", "w", encoding="utf-8") as f: - json.dump(result["result"], f, indent=2) - print("💾 Structured data saved to 'async_extracted_dashboard_data.json'") - - # Also save the raw markdown if available - if result.get("markdown"): - with open("async_ai_scraped_content.md", "w", encoding="utf-8") as f: - f.write(result["markdown"]) - print("📄 Raw markdown also saved to 'async_ai_scraped_content.md'") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - - -async def example_multiple_sites_concurrently(): - """Example: Scrape multiple sites concurrently with different extraction modes.""" - - # Initialize the async client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return None - - async with AsyncClient(api_key=api_key) as client: - # Define different scraping tasks - tasks = [ - { - "name": "Dashboard Login (No AI)", - "url": "https://dashboard.scrapegraphai.com/", - "steps": [ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login" - ], - "ai_extraction": False - }, - { - "name": "Product Page (With AI)", - "url": "https://example-store.com/products/laptop", - "steps": [ - "scroll down to product details", - "click on specifications tab", - "scroll down to reviews section" - ], - "ai_extraction": True, - "user_prompt": "Extract product name, price, specifications, and customer review summary", - "output_schema": { - "product": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "price": {"type": "string"}, - "specifications": {"type": "object"}, - "review_summary": { - "type": "object", - "properties": { - "average_rating": {"type": "number"}, - "total_reviews": {"type": "number"} - } - } - } - } - } - }, - { - "name": "News Article (With AI)", - "url": "https://example-news.com/tech-article", - "steps": 
[ - "scroll down to read full article", - "click on related articles section" - ], - "ai_extraction": True, - "user_prompt": "Extract article title, author, publication date, main content summary, and related article titles", - "output_schema": { - "article": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "author": {"type": "string"}, - "publication_date": {"type": "string"}, - "summary": {"type": "string"}, - "related_articles": { - "type": "array", - "items": {"type": "string"} - } - } - } - } - } - ] - - async def scrape_site(task): - """Helper function to scrape a single site.""" - try: - print(f"🚀 Starting: {task['name']}") - - kwargs = { - "url": task["url"], - "steps": task["steps"], - "use_session": True, - "ai_extraction": task["ai_extraction"] - } - - if task["ai_extraction"]: - kwargs["user_prompt"] = task["user_prompt"] - kwargs["output_schema"] = task["output_schema"] - - result = await client.agenticscraper(**kwargs) - - print(f"✅ Completed: {task['name']} (Request ID: {result.get('request_id')})") - return { - "task_name": task["name"], - "result": result, - "success": True - } - - except Exception as e: - print(f"❌ Failed: {task['name']} - {str(e)}") - return { - "task_name": task["name"], - "error": str(e), - "success": False - } - - try: - print("🔄 Starting concurrent scraping of multiple sites...") - print(f"📊 Total tasks: {len(tasks)}") - - # Run all scraping tasks concurrently - results = await asyncio.gather( - *[scrape_site(task) for task in tasks], - return_exceptions=True - ) - - print("\n📋 Concurrent Scraping Results:") - print("=" * 50) - - successful_results = [] - failed_results = [] - - for result in results: - if isinstance(result, Exception): - print(f"❌ Exception occurred: {str(result)}") - failed_results.append({"error": str(result)}) - elif result["success"]: - print(f"✅ {result['task_name']}: Success") - successful_results.append(result) - - # Save individual results - filename = 
f"concurrent_{result['task_name'].lower().replace(' ', '_').replace('(', '').replace(')', '')}_result.json" - with open(filename, "w", encoding="utf-8") as f: - json.dump(result["result"], f, indent=2) - print(f" 💾 Saved to: {filename}") - else: - print(f"❌ {result['task_name']}: Failed - {result['error']}") - failed_results.append(result) - - # Save summary - summary = { - "total_tasks": len(tasks), - "successful": len(successful_results), - "failed": len(failed_results), - "success_rate": f"{(len(successful_results) / len(tasks)) * 100:.1f}%", - "results": results - } - - with open("concurrent_scraping_summary.json", "w", encoding="utf-8") as f: - json.dump(summary, f, indent=2) - print(f"\n📊 Summary saved to: concurrent_scraping_summary.json") - print(f" Success Rate: {summary['success_rate']}") - - return results - - except Exception as e: - print(f"❌ Concurrent scraping error: {str(e)}") - return None - - -async def example_step_by_step_with_ai(): - """Example: Step-by-step form interaction with AI extraction.""" - - # Initialize the async client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return None - - async with AsyncClient(api_key=api_key) as client: - steps = [ - "navigate to contact page", - "fill in name field with 'Jane Smith'", - "fill in email field with 'jane.smith@company.com'", - "select 'Business Inquiry' from dropdown", - "fill in message: 'I would like to discuss enterprise pricing options for 100+ users'", - "click on terms and conditions checkbox", - "click submit button", - "wait for success message and capture any reference number" - ] - - output_schema = { - "contact_form_result": { - "type": "object", - "properties": { - "submission_status": {"type": "string"}, - "success_message": {"type": "string"}, - "reference_number": {"type": "string"}, - "next_steps": {"type": "string"}, - "contact_info": {"type": "string"}, - 
"estimated_response_time": {"type": "string"} - }, - "required": ["submission_status", "success_message"] - } - } - - try: - print("📝 Starting step-by-step form interaction with AI extraction...") - print(f"URL: https://example-business.com/contact") - print(f"Steps: {len(steps)} steps defined") - - result = await client.agenticscraper( - url="https://example-business.com/contact", - steps=steps, - use_session=True, - user_prompt="Extract the form submission result including status, success message, any reference number provided, next steps mentioned, contact information for follow-up, and estimated response time", - output_schema=output_schema, - ai_extraction=True - ) - - print("✅ Step-by-step form interaction completed!") - print(f"Request ID: {result.get('request_id')}") - - if result and result.get("result"): - form_result = result["result"].get("contact_form_result", {}) - - print("\n📋 Form Submission Analysis:") - print(f" 📊 Status: {form_result.get('submission_status', 'Unknown')}") - print(f" ✅ Message: {form_result.get('success_message', 'No message')}") - - if form_result.get('reference_number'): - print(f" 🔢 Reference: {form_result['reference_number']}") - - if form_result.get('next_steps'): - print(f" 👉 Next Steps: {form_result['next_steps']}") - - if form_result.get('contact_info'): - print(f" 📞 Contact Info: {form_result['contact_info']}") - - if form_result.get('estimated_response_time'): - print(f" ⏰ Response Time: {form_result['estimated_response_time']}") - - # Save detailed results - with open("async_step_by_step_form_result.json", "w", encoding="utf-8") as f: - json.dump(result["result"], f, indent=2) - print("\n💾 Detailed results saved to 'async_step_by_step_form_result.json'") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - - -async def main(): - """Main async function to run all examples.""" - print("🔧 Comprehensive Async Agentic Scraper Examples") - print("=" * 60) - - # Check if API key is set - 
api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("⚠️ Please set your SGAI_API_KEY environment variable before running!") - print("You can either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. Create a .env file with: SGAI_API_KEY=your-api-key-here") - return - - print("\n1. Basic Async Scraping (No AI Extraction)") - print("-" * 50) - await example_basic_scraping_no_ai() - - print("\n\n2. Async AI Extraction Example - Dashboard Data") - print("-" * 50) - await example_ai_extraction() - - print("\n\n3. Concurrent Multi-Site Scraping") - print("-" * 50) - # Uncomment to run concurrent scraping example - # await example_multiple_sites_concurrently() - - print("\n\n4. Step-by-Step Form Interaction with AI") - print("-" * 50) - # Uncomment to run step-by-step form example - # await example_step_by_step_with_ai() - - print("\n✨ Async examples completed!") - print("\nℹ️ Note: Some examples are commented out by default.") - print(" Uncomment them in the main function to run additional examples.") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/agenticscraper/async/async_agenticscraper_example.py b/scrapegraph-py/examples/agenticscraper/async/async_agenticscraper_example.py deleted file mode 100644 index 5b3edc9..0000000 --- a/scrapegraph-py/examples/agenticscraper/async/async_agenticscraper_example.py +++ /dev/null @@ -1,93 +0,0 @@ -import asyncio -import os - -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - - -async def main(): - # Initialize async client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. 
Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. Create a .env file with: SGAI_API_KEY=your-api-key-here") - return - - sgai_client = AsyncClient(api_key=api_key) - - print("🤖 Example 1: Basic Async Agentic Scraping (No AI Extraction)") - print("=" * 60) - - # AgenticScraper request - basic automated login example (no AI) - response = await sgai_client.agenticscraper( - url="https://dashboard.scrapegraphai.com/", - use_session=True, - steps=[ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login" - ], - ai_extraction=False # No AI extraction - just get raw content - ) - - # Print the response - print(f"Request ID: {response['request_id']}") - print(f"Result: {response.get('result', 'No result yet')}") - print(f"Status: {response.get('status', 'Unknown')}") - - print("\n\n🧠 Example 2: Async Agentic Scraping with AI Extraction") - print("=" * 60) - - # Define schema for AI extraction - output_schema = { - "dashboard_info": { - "type": "object", - "properties": { - "username": {"type": "string"}, - "email": {"type": "string"}, - "dashboard_sections": { - "type": "array", - "items": {"type": "string"} - }, - "credits_remaining": {"type": "number"} - }, - "required": ["username", "dashboard_sections"] - } - } - - # AgenticScraper request with AI extraction - ai_response = await sgai_client.agenticscraper( - url="https://dashboard.scrapegraphai.com/", - use_session=True, - steps=[ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login", - "wait for dashboard to load completely" - ], - user_prompt="Extract user information, available dashboard sections, and remaining credits from the dashboard", - output_schema=output_schema, - ai_extraction=True - ) - - # Print the AI extraction response - print(f"AI Request ID: {ai_response['request_id']}") - print(f"AI Result: {ai_response.get('result', 'No result yet')}") - print(f"AI 
Status: {ai_response.get('status', 'Unknown')}") - print(f"User Prompt: Extract user information, available dashboard sections, and remaining credits") - print(f"Schema Provided: {'Yes' if output_schema else 'No'}") - - await sgai_client.close() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/agenticscraper/sync/agenticscraper_comprehensive_example.py b/scrapegraph-py/examples/agenticscraper/sync/agenticscraper_comprehensive_example.py deleted file mode 100644 index c1e7754..0000000 --- a/scrapegraph-py/examples/agenticscraper/sync/agenticscraper_comprehensive_example.py +++ /dev/null @@ -1,397 +0,0 @@ -#!/usr/bin/env python3 -""" -Comprehensive Agentic Scraper Example - -This example demonstrates how to use the agentic scraper API endpoint -to perform automated browser actions and scrape content with both -AI extraction and non-AI extraction modes. - -The agentic scraper can: -1. Navigate to a website -2. Perform a series of automated actions (like filling forms, clicking buttons) -3. Extract the resulting HTML content as markdown -4. Optionally use AI to extract structured data - -Usage: - python examples/sync/agenticscraper_comprehensive_example.py -""" - -import json -import os -import time -from typing import Dict, List, Optional - -from dotenv import load_dotenv - -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -# Set logging level -sgai_logger.set_logging(level="INFO") - - -def example_basic_scraping_no_ai(): - """Example: Basic agentic scraping without AI extraction.""" - - # Initialize the client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: SGAI_API_KEY=your-api-key-here") - return None - - client = Client(api_key=api_key) - - # Define the steps to perform - steps = [ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login", - ] - - try: - print("🚀 Starting basic agentic scraping (no AI extraction)...") - print(f"URL: https://dashboard.scrapegraphai.com/") - print(f"Steps: {steps}") - - # Perform the scraping without AI extraction - result = client.agenticscraper( - url="https://dashboard.scrapegraphai.com/", - steps=steps, - use_session=True, - ai_extraction=False # No AI extraction - just get raw markdown - ) - - print("✅ Basic scraping completed successfully!") - print(f"Request ID: {result.get('request_id')}") - - # Save the markdown content to a file - if result.get("markdown"): - with open("basic_scraped_content.md", "w", encoding="utf-8") as f: - f.write(result["markdown"]) - print("📄 Markdown content saved to 'basic_scraped_content.md'") - - # Print a preview of the content - if result.get("markdown"): - preview = ( - result["markdown"][:500] + "..." 
- if len(result["markdown"]) > 500 - else result["markdown"] - ) - print(f"\n📝 Content Preview:\n{preview}") - - if result.get("error"): - print(f"⚠️ Warning: {result['error']}") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - finally: - client.close() - - -def example_ai_extraction(): - """Example: Use AI extraction to get structured data from dashboard.""" - - # Initialize the client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return None - - client = Client(api_key=api_key) - - # Define extraction schema for user dashboard information - output_schema = { - "user_info": { - "type": "object", - "properties": { - "username": {"type": "string"}, - "email": {"type": "string"}, - "dashboard_sections": { - "type": "array", - "items": {"type": "string"} - }, - "account_status": {"type": "string"}, - "credits_remaining": {"type": "number"} - }, - "required": ["username", "dashboard_sections"] - } - } - - steps = [ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login", - "wait for dashboard to load completely", - ] - - try: - print("🤖 Starting agentic scraping with AI extraction...") - print(f"URL: https://dashboard.scrapegraphai.com/") - print(f"Steps: {steps}") - - result = client.agenticscraper( - url="https://dashboard.scrapegraphai.com/", - steps=steps, - use_session=True, - user_prompt="Extract user information, available dashboard sections, account status, and remaining credits from the dashboard", - output_schema=output_schema, - ai_extraction=True - ) - - print("✅ AI extraction completed!") - print(f"Request ID: {result.get('request_id')}") - - if result.get("result"): - print("🎯 Extracted Structured Data:") - print(json.dumps(result["result"], indent=2)) - - # Save extracted data to JSON file - with open("extracted_dashboard_data.json", "w", 
encoding="utf-8") as f: - json.dump(result["result"], f, indent=2) - print("💾 Structured data saved to 'extracted_dashboard_data.json'") - - # Also save the raw markdown if available - if result.get("markdown"): - with open("ai_scraped_content.md", "w", encoding="utf-8") as f: - f.write(result["markdown"]) - print("📄 Raw markdown also saved to 'ai_scraped_content.md'") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - finally: - client.close() - - -def example_ecommerce_product_scraping(): - """Example: Scraping an e-commerce site for product information.""" - - # Initialize the client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return None - - client = Client(api_key=api_key) - - steps = [ - "click on search box", - "type 'laptop' in search box", - "press enter", - "wait for search results to load", - "scroll down 3 times to load more products", - ] - - output_schema = { - "products": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "price": {"type": "string"}, - "rating": {"type": "number"}, - "availability": {"type": "string"}, - "description": {"type": "string"}, - "image_url": {"type": "string"} - }, - "required": ["name", "price"] - } - }, - "search_info": { - "type": "object", - "properties": { - "total_results": {"type": "number"}, - "search_term": {"type": "string"}, - "page": {"type": "number"} - } - } - } - - try: - print("🛒 Scraping e-commerce products with AI extraction...") - print(f"URL: https://example-ecommerce.com") - print(f"Steps: {steps}") - - result = client.agenticscraper( - url="https://example-ecommerce.com", - steps=steps, - use_session=True, - user_prompt="Extract all visible product information including names, prices, ratings, availability status, descriptions, and image URLs. 
Also extract search metadata like total results and current page.", - output_schema=output_schema, - ai_extraction=True - ) - - print("✅ E-commerce scraping completed!") - print(f"Request ID: {result.get('request_id')}") - - if result and result.get("result"): - products = result["result"].get("products", []) - search_info = result["result"].get("search_info", {}) - - print(f"🔍 Search Results for '{search_info.get('search_term', 'laptop')}':") - print(f"📊 Total Results: {search_info.get('total_results', 'Unknown')}") - print(f"📄 Current Page: {search_info.get('page', 'Unknown')}") - print(f"🛍️ Products Found: {len(products)}") - - print("\n📦 Product Details:") - for i, product in enumerate(products[:5], 1): # Show first 5 products - print(f"\n{i}. {product.get('name', 'N/A')}") - print(f" 💰 Price: {product.get('price', 'N/A')}") - print(f" ⭐ Rating: {product.get('rating', 'N/A')}") - print(f" 📦 Availability: {product.get('availability', 'N/A')}") - if product.get('description'): - desc = product['description'][:100] + "..." 
if len(product['description']) > 100 else product['description'] - print(f" 📝 Description: {desc}") - - # Save extracted data - with open("ecommerce_products.json", "w", encoding="utf-8") as f: - json.dump(result["result"], f, indent=2) - print("\n💾 Product data saved to 'ecommerce_products.json'") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - finally: - client.close() - - -def example_form_filling_and_data_extraction(): - """Example: Fill out a contact form and extract confirmation details.""" - - # Initialize the client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return None - - client = Client(api_key=api_key) - - steps = [ - "find and click on contact form", - "type 'John Doe' in name field", - "type 'john.doe@example.com' in email field", - "type 'Product Inquiry' in subject field", - "type 'I am interested in your premium plan. Could you provide more details about pricing and features?' 
in message field", - "click submit button", - "wait for confirmation message to appear", - ] - - output_schema = { - "form_submission": { - "type": "object", - "properties": { - "status": {"type": "string"}, - "confirmation_message": {"type": "string"}, - "reference_number": {"type": "string"}, - "estimated_response_time": {"type": "string"}, - "submitted_data": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "email": {"type": "string"}, - "subject": {"type": "string"} - } - } - }, - "required": ["status", "confirmation_message"] - } - } - - try: - print("📝 Filling contact form and extracting confirmation...") - print(f"URL: https://example-company.com/contact") - print(f"Steps: {steps}") - - result = client.agenticscraper( - url="https://example-company.com/contact", - steps=steps, - use_session=True, - user_prompt="Extract the form submission status, confirmation message, any reference numbers, estimated response time, and echo back the submitted form data", - output_schema=output_schema, - ai_extraction=True - ) - - print("✅ Form submission and extraction completed!") - print(f"Request ID: {result.get('request_id')}") - - if result and result.get("result"): - form_data = result["result"].get("form_submission", {}) - - print(f"📋 Form Submission Results:") - print(f" ✅ Status: {form_data.get('status', 'Unknown')}") - print(f" 💬 Message: {form_data.get('confirmation_message', 'No message')}") - - if form_data.get('reference_number'): - print(f" 🔢 Reference: {form_data['reference_number']}") - - if form_data.get('estimated_response_time'): - print(f" ⏰ Response Time: {form_data['estimated_response_time']}") - - submitted_data = form_data.get('submitted_data', {}) - if submitted_data: - print(f"\n📤 Submitted Data:") - for key, value in submitted_data.items(): - print(f" {key.title()}: {value}") - - # Save form results - with open("form_submission_results.json", "w", encoding="utf-8") as f: - json.dump(result["result"], f, indent=2) - 
print("\n💾 Form results saved to 'form_submission_results.json'") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - finally: - client.close() - - -if __name__ == "__main__": - print("🔧 Comprehensive Agentic Scraper Examples") - print("=" * 60) - - # Check if API key is set - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("⚠️ Please set your SGAI_API_KEY environment variable before running!") - print("You can either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. Create a .env file with: SGAI_API_KEY=your-api-key-here") - exit(1) - - print("\n1. Basic Scraping (No AI Extraction)") - print("-" * 40) - example_basic_scraping_no_ai() - - print("\n\n2. AI Extraction Example - Dashboard Data") - print("-" * 40) - example_ai_extraction() - - print("\n\n3. E-commerce Product Scraping with AI") - print("-" * 40) - # Uncomment to run e-commerce example - # example_ecommerce_product_scraping() - - print("\n\n4. 
Form Filling and Confirmation Extraction") - print("-" * 40) - # Uncomment to run form filling example - # example_form_filling_and_data_extraction() - - print("\n✨ Examples completed!") - print("\nℹ️ Note: Some examples are commented out by default.") - print(" Uncomment them in the main section to run additional examples.") diff --git a/scrapegraph-py/examples/agenticscraper/sync/agenticscraper_example.py b/scrapegraph-py/examples/agenticscraper/sync/agenticscraper_example.py deleted file mode 100644 index ecf658d..0000000 --- a/scrapegraph-py/examples/agenticscraper/sync/agenticscraper_example.py +++ /dev/null @@ -1,86 +0,0 @@ -import os - -from dotenv import load_dotenv - -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - -# Initialize the client with API key from environment variable -api_key = os.getenv("SGAI_API_KEY") -if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: SGAI_API_KEY=your-api-key-here") - exit(1) - -sgai_client = Client(api_key=api_key) - -print("🤖 Example 1: Basic Agentic Scraping (No AI Extraction)") -print("=" * 60) - -# AgenticScraper request - basic automated login example (no AI) -response = sgai_client.agenticscraper( - url="https://dashboard.scrapegraphai.com/", - use_session=True, - steps=[ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login" - ], - ai_extraction=False # No AI extraction - just get raw content -) - -# Print the response -print(f"Request ID: {response['request_id']}") -print(f"Result: {response.get('result', 'No result yet')}") -print(f"Status: {response.get('status', 'Unknown')}") - -print("\n\n🧠 Example 2: Agentic Scraping with AI Extraction") -print("=" * 60) - -# Define schema for AI extraction -output_schema = { - "dashboard_info": { - "type": "object", - "properties": { - "username": {"type": "string"}, - "email": {"type": "string"}, - "dashboard_sections": { - "type": "array", - "items": {"type": "string"} - }, - "credits_remaining": {"type": "number"} - }, - "required": ["username", "dashboard_sections"] - } -} - -# AgenticScraper request with AI extraction -ai_response = sgai_client.agenticscraper( - url="https://dashboard.scrapegraphai.com/", - use_session=True, - steps=[ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login", - "wait for dashboard to load completely" - ], - user_prompt="Extract user information, available dashboard sections, and remaining credits from the dashboard", - output_schema=output_schema, - ai_extraction=True -) - -# Print the AI extraction response -print(f"AI Request ID: {ai_response['request_id']}") -print(f"AI Result: {ai_response.get('result', 'No result yet')}") -print(f"AI Status: {ai_response.get('status', 'Unknown')}") -print(f"User Prompt: Extract user information, available dashboard sections, and 
remaining credits") -print(f"Schema Provided: {'Yes' if output_schema else 'No'}") - -sgai_client.close() diff --git a/scrapegraph-py/examples/crawl/async/async_crawl_example.py b/scrapegraph-py/examples/crawl/async/async_crawl_example.py deleted file mode 100644 index 2a4d3bd..0000000 --- a/scrapegraph-py/examples/crawl/async/async_crawl_example.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint using the async client. - -Requirements: -- Python 3.7+ -- scrapegraph-py -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import asyncio -import json -import os -import time - -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient - -# Load environment variables from .env file -load_dotenv() - - -async def main(): - if not os.getenv("SGAI_API_KEY"): - print("Error: SGAI_API_KEY not found in .env file") - print("Please create a .env file with your API key:") - print("SGAI_API_KEY=your_api_key_here") - return - - # Simple schema for founders' information - schema = { - "type": "object", - "properties": { - "founders": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "title": {"type": "string"}, - "bio": {"type": "string"}, - "linkedin": {"type": "string"}, - "twitter": {"type": "string"}, - }, - }, - } - }, - } - - url = "https://scrapegraphai.com" - prompt = "extract the founders'infos" - - try: - # Initialize the async client - async with AsyncClient.from_env() as client: - # Start the crawl job - print(f"\nStarting crawl for: {url}") - start_time = time.time() - crawl_response = await client.crawl( - url=url, - prompt=prompt, - data_schema=schema, - cache_website=True, - depth=2, - max_pages=2, - same_domain_only=True, - sitemap=True, # Use sitemap for better page discovery - # batch_size is optional and will be excluded if not provided - ) - execution_time = time.time() - start_time 
- print(f"POST /v1/crawl/ execution time: {execution_time:.2f} seconds") - print("\nCrawl job started. Response:") - print(json.dumps(crawl_response, indent=2)) - - # If the crawl is asynchronous and returns an ID, fetch the result - crawl_id = crawl_response.get("id") or crawl_response.get("task_id") - start_time = time.time() - if crawl_id: - print("\nPolling for crawl result...") - for _ in range(10): - await asyncio.sleep(5) - result = await client.get_crawl(crawl_id) - if result.get("status") == "success" and result.get("result"): - execution_time = time.time() - start_time - print( - f"GET /v1/crawl/{crawl_id} execution time: {execution_time:.2f} seconds" - ) - print("\nCrawl completed. Result:") - print(json.dumps(result["result"]["llm_result"], indent=2)) - break - elif result.get("status") == "failed": - print("\nCrawl failed. Result:") - print(json.dumps(result, indent=2)) - break - else: - print(f"Status: {result.get('status')}, waiting...") - else: - print("Crawl did not complete in time.") - else: - print("No crawl ID found in response. Synchronous result:") - print(json.dumps(crawl_response, indent=2)) - - except Exception as e: - print(f"Error occurred: {str(e)}") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/crawl/async/async_crawl_markdown_direct_api_example.py b/scrapegraph-py/examples/crawl/async/async_crawl_markdown_direct_api_example.py deleted file mode 100644 index 61df0a1..0000000 --- a/scrapegraph-py/examples/crawl/async/async_crawl_markdown_direct_api_example.py +++ /dev/null @@ -1,254 +0,0 @@ -#!/usr/bin/env python3 -""" -Async example script demonstrating the ScrapeGraphAI Crawler markdown conversion mode. 
- -This example shows how to use the crawler in markdown conversion mode: -- Cost-effective markdown conversion (NO AI/LLM processing) -- 2 credits per page (80% savings compared to AI mode) -- Clean HTML to markdown conversion with metadata extraction - -Requirements: -- Python 3.7+ -- aiohttp -- python-dotenv -- A .env file with your API_KEY - -Example .env file: -API_KEY=your_api_key_here -""" - -import asyncio -import json -import os -from typing import Any, Dict - -import aiohttp -from dotenv import load_dotenv - -# Load environment variables from .env file -load_dotenv() - -# Configuration - API key from environment or fallback -API_KEY = os.getenv("TEST_API_KEY", "sgai-xxx") # Load from .env file -BASE_URL = os.getenv("BASE_URL", "http://localhost:8001") # Can be overridden via env - - -async def make_request(url: str, data: Dict[str, Any]) -> Dict[str, Any]: - """Make an HTTP request to the API.""" - headers = {"Content-Type": "application/json", "SGAI-APIKEY": API_KEY} - - async with aiohttp.ClientSession() as session: - async with session.post(url, json=data, headers=headers) as response: - return await response.json() - - -async def poll_result(task_id: str) -> Dict[str, Any]: - """Poll for the result of a crawl job with rate limit handling.""" - headers = {"SGAI-APIKEY": API_KEY} - url = f"{BASE_URL}/v1/crawl/{task_id}" - - async with aiohttp.ClientSession() as session: - async with session.get(url, headers=headers) as response: - if response.status == 429: - # Rate limited - return special status to handle in polling loop - return {"status": "rate_limited", "retry_after": 60} - return await response.json() - - -async def poll_with_backoff(task_id: str, max_attempts: int = 20) -> Dict[str, Any]: - """ - Poll for crawl results with intelligent backoff to avoid rate limits. 
- - Args: - task_id: The task ID to poll for - max_attempts: Maximum number of polling attempts - - Returns: - The final result or raises an exception on timeout/failure - """ - print("⏳ Starting to poll for results with rate-limit protection...") - - # Initial wait to give the job time to start processing - await asyncio.sleep(15) - - for attempt in range(max_attempts): - try: - result = await poll_result(task_id) - status = result.get("status") - - if status == "rate_limited": - wait_time = min( - 90, 30 + (attempt * 10) - ) # Exponential backoff for rate limits - print(f"⚠️ Rate limited! Waiting {wait_time}s before retry...") - await asyncio.sleep(wait_time) - continue - - elif status == "success": - return result - - elif status == "failed": - raise Exception(f"Crawl failed: {result.get('error', 'Unknown error')}") - - else: - # Calculate progressive wait time: start at 15s, increase gradually - base_wait = 15 - progressive_wait = min(60, base_wait + (attempt * 3)) # Cap at 60s - - print( - f"⏳ Status: {status} (attempt {attempt + 1}/{max_attempts}) - waiting {progressive_wait}s..." - ) - await asyncio.sleep(progressive_wait) - - except Exception as e: - if "rate" in str(e).lower() or "429" in str(e): - wait_time = min(90, 45 + (attempt * 10)) - print(f"⚠️ Rate limit detected in error, waiting {wait_time}s...") - await asyncio.sleep(wait_time) - continue - else: - print(f"❌ Error polling for results: {e}") - if attempt < max_attempts - 1: - await asyncio.sleep(20) # Wait before retry - continue - raise - - raise Exception(f"⏰ Timeout: Job did not complete after {max_attempts} attempts") - - -async def markdown_crawling_example(): - """ - Markdown Conversion Mode (NO AI/LLM Used) - - This example demonstrates cost-effective crawling that converts pages to clean markdown - WITHOUT any AI processing. Perfect for content archival and when you only need clean markdown. 
- """ - print("=" * 60) - print("ASYNC MARKDOWN CONVERSION MODE (NO AI/LLM)") - print("=" * 60) - print("Use case: Get clean markdown content without AI processing") - print("Cost: 2 credits per page (80% savings!)") - print("Features: Clean markdown conversion, metadata extraction") - print("⚠️ NO AI/LLM PROCESSING - Pure HTML to markdown conversion only!") - print() - - # Markdown conversion request - NO AI/LLM processing - request_data = { - "url": "https://scrapegraphai.com/", - "extraction_mode": False, # FALSE = Markdown conversion mode (NO AI/LLM used) - "depth": 2, - "max_pages": 2, - "same_domain_only": True, - "sitemap": False, # Use sitemap for better coverage - # Note: No prompt needed when extraction_mode = False - } - - print(f"🌐 Target URL: {request_data['url']}") - print("🤖 AI Prompt: None (no AI processing)") - print(f"📊 Crawl Depth: {request_data['depth']}") - print(f"📄 Max Pages: {request_data['max_pages']}") - print(f"🗺️ Use Sitemap: {request_data['sitemap']}") - print("💡 Mode: Pure HTML to markdown conversion") - print() - - # Start the markdown conversion job - print("🚀 Starting markdown conversion job...") - response = await make_request(f"{BASE_URL}/v1/crawl", request_data) - task_id = response.get("task_id") - - if not task_id: - print("❌ Failed to start markdown conversion job") - return - - print(f"📋 Task ID: {task_id}") - print("⏳ Polling for results...") - print() - - # Poll for results with rate-limit protection - try: - result = await poll_with_backoff(task_id, max_attempts=20) - - print("✅ Markdown conversion completed successfully!") - print() - - result_data = result.get("result", {}) - pages = result_data.get("pages", []) - crawled_urls = result_data.get("crawled_urls", []) - credits_used = result_data.get("credits_used", 0) - pages_processed = result_data.get("pages_processed", 0) - - # Prepare JSON output - json_output = { - "conversion_results": { - "pages_processed": pages_processed, - "credits_used": credits_used, - 
"cost_per_page": ( - credits_used / pages_processed if pages_processed > 0 else 0 - ), - "crawled_urls": crawled_urls, - }, - "markdown_content": {"total_pages": len(pages), "pages": []}, - } - - # Add page details to JSON - for i, page in enumerate(pages): - metadata = page.get("metadata", {}) - page_data = { - "page_number": i + 1, - "url": page.get("url"), - "title": page.get("title"), - "metadata": { - "word_count": metadata.get("word_count", 0), - "headers": metadata.get("headers", []), - "links_count": metadata.get("links_count", 0), - }, - "markdown_content": page.get("markdown", ""), - } - json_output["markdown_content"]["pages"].append(page_data) - - # Print JSON output - print("📊 RESULTS IN JSON FORMAT:") - print("-" * 40) - print(json.dumps(json_output, indent=2, ensure_ascii=False)) - - except Exception as e: - print(f"❌ Markdown conversion failed: {str(e)}") - - -async def main(): - """Run the async markdown crawling example.""" - print("🌐 ScrapeGraphAI Async Crawler - Markdown Conversion Example") - print("Cost-effective HTML to Markdown conversion (NO AI/LLM)") - print("=" * 60) - - # Check if API key is set - if API_KEY == "sgai-xxx": - print("⚠️ Please set your API key in the .env file") - print(" Create a .env file with your API key:") - print(" API_KEY=your_api_key_here") - print() - print(" You can get your API key from: https://dashboard.scrapegraphai.com") - print() - print(" Example .env file:") - print(" API_KEY=sgai-your-actual-api-key-here") - print(" BASE_URL=https://api.scrapegraphai.com # Optional") - return - - print(f"🔑 Using API key: {API_KEY[:10]}...") - print(f"🌐 Base URL: {BASE_URL}") - print() - - # Run the single example - await markdown_crawling_example() # Markdown conversion mode (NO AI) - - print("\n" + "=" * 60) - print("🎉 Example completed!") - print("💡 This demonstrates async markdown conversion mode:") - print(" • Cost-effective: Only 2 credits per page") - print(" • No AI/LLM processing - pure HTML to markdown 
conversion") - print(" • Perfect for content archival and documentation") - print(" • 80% cheaper than AI extraction modes!") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/crawl/async/async_crawl_markdown_example.py b/scrapegraph-py/examples/crawl/async/async_crawl_markdown_example.py deleted file mode 100644 index 767f197..0000000 --- a/scrapegraph-py/examples/crawl/async/async_crawl_markdown_example.py +++ /dev/null @@ -1,218 +0,0 @@ -#!/usr/bin/env python3 -""" -Async example demonstrating the ScrapeGraphAI Crawler markdown conversion mode. - -This example shows how to use the async crawler in markdown conversion mode: -- Cost-effective markdown conversion (NO AI/LLM processing) -- 2 credits per page (80% savings compared to AI mode) -- Clean HTML to markdown conversion with metadata extraction - -Requirements: -- Python 3.7+ -- scrapegraph-py -- aiohttp (installed with scrapegraph-py) -- A valid API key - -Usage: - python async_crawl_markdown_example.py -""" - -import asyncio -import json -import os -from typing import Any, Dict - -from scrapegraph_py import AsyncClient - - -async def poll_for_result( - client: AsyncClient, crawl_id: str, max_attempts: int = 20 -) -> Dict[str, Any]: - """ - Poll for crawl results with intelligent backoff to avoid rate limits. 
- - Args: - client: The async ScrapeGraph client - crawl_id: The crawl ID to poll for - max_attempts: Maximum number of polling attempts - - Returns: - The final result or raises an exception on timeout/failure - """ - print("⏳ Starting to poll for results with rate-limit protection...") - - # Initial wait to give the job time to start processing - await asyncio.sleep(15) - - for attempt in range(max_attempts): - try: - result = await client.get_crawl(crawl_id) - status = result.get("status") - - if status == "success": - return result - elif status == "failed": - raise Exception(f"Crawl failed: {result.get('error', 'Unknown error')}") - else: - # Calculate progressive wait time: start at 15s, increase gradually - base_wait = 15 - progressive_wait = min(60, base_wait + (attempt * 3)) # Cap at 60s - - print( - f"⏳ Status: {status} (attempt {attempt + 1}/{max_attempts}) - waiting {progressive_wait}s..." - ) - await asyncio.sleep(progressive_wait) - - except Exception as e: - if "rate" in str(e).lower() or "429" in str(e): - wait_time = min(90, 45 + (attempt * 10)) - print(f"⚠️ Rate limit detected in error, waiting {wait_time}s...") - await asyncio.sleep(wait_time) - continue - else: - print(f"❌ Error polling for results: {e}") - if attempt < max_attempts - 1: - await asyncio.sleep(20) # Wait before retry - continue - raise - - raise Exception(f"⏰ Timeout: Job did not complete after {max_attempts} attempts") - - -async def markdown_crawling_example(): - """ - Markdown Conversion Mode (NO AI/LLM Used) - - This example demonstrates cost-effective crawling that converts pages to clean markdown - WITHOUT any AI processing. Perfect for content archival and when you only need clean markdown. 
- """ - print("=" * 60) - print("ASYNC MARKDOWN CONVERSION MODE (NO AI/LLM)") - print("=" * 60) - print("Use case: Get clean markdown content without AI processing") - print("Cost: 2 credits per page (80% savings!)") - print("Features: Clean markdown conversion, metadata extraction") - print("⚠️ NO AI/LLM PROCESSING - Pure HTML to markdown conversion only!") - print() - - # Initialize the async client - client = AsyncClient.from_env() - - # Target URL for markdown conversion - url = "https://scrapegraphai.com/" - - print(f"🌐 Target URL: {url}") - print("🤖 AI Prompt: None (no AI processing)") - print("📊 Crawl Depth: 2") - print("📄 Max Pages: 2") - print("🗺️ Use Sitemap: True") - print("💡 Mode: Pure HTML to markdown conversion") - print() - - # Start the markdown conversion job - print("🚀 Starting markdown conversion job...") - - # Call crawl with extraction_mode=False for markdown conversion - response = await client.crawl( - url=url, - extraction_mode=False, # FALSE = Markdown conversion mode (NO AI/LLM used) - depth=2, - max_pages=2, - same_domain_only=True, - sitemap=True, # Use sitemap for better coverage - # Note: No prompt or data_schema needed when extraction_mode=False - ) - - crawl_id = response.get("crawl_id") or response.get("task_id") - - if not crawl_id: - print("❌ Failed to start markdown conversion job") - return - - print(f"📋 Crawl ID: {crawl_id}") - print("⏳ Polling for results...") - print() - - # Poll for results with rate-limit protection - try: - result = await poll_for_result(client, crawl_id, max_attempts=20) - - print("✅ Markdown conversion completed successfully!") - print() - - result_data = result.get("result", {}) - pages = result_data.get("pages", []) - crawled_urls = result_data.get("crawled_urls", []) - credits_used = result_data.get("credits_used", 0) - pages_processed = result_data.get("pages_processed", 0) - - # Prepare JSON output - json_output = { - "conversion_results": { - "pages_processed": pages_processed, - "credits_used": 
credits_used, - "cost_per_page": ( - credits_used / pages_processed if pages_processed > 0 else 0 - ), - "crawled_urls": crawled_urls, - }, - "markdown_content": {"total_pages": len(pages), "pages": []}, - } - - # Add page details to JSON - for i, page in enumerate(pages): - metadata = page.get("metadata", {}) - page_data = { - "page_number": i + 1, - "url": page.get("url"), - "title": page.get("title"), - "metadata": { - "word_count": metadata.get("word_count", 0), - "headers": metadata.get("headers", []), - "links_count": metadata.get("links_count", 0), - }, - "markdown_content": page.get("markdown", ""), - } - json_output["markdown_content"]["pages"].append(page_data) - - # Print JSON output - print("📊 RESULTS IN JSON FORMAT:") - print("-" * 40) - print(json.dumps(json_output, indent=2, ensure_ascii=False)) - - except Exception as e: - print(f"❌ Markdown conversion failed: {str(e)}") - - -async def main(): - """Run the async markdown crawling example.""" - print("🌐 ScrapeGraphAI Async Crawler - Markdown Conversion Example") - print("Cost-effective HTML to Markdown conversion (NO AI/LLM)") - print("=" * 60) - - # Check if API key is set - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("⚠️ Please set your API key in the environment variable SGAI_API_KEY") - print(" export SGAI_API_KEY=your_api_key_here") - print() - print(" You can get your API key from: https://dashboard.scrapegraphai.com") - return - - print(f"🔑 Using API key: {api_key[:10]}...") - print() - - # Run the markdown conversion example - await markdown_crawling_example() - - print("\n" + "=" * 60) - print("🎉 Example completed!") - print("💡 This demonstrates async markdown conversion mode:") - print(" • Cost-effective: Only 2 credits per page") - print(" • No AI/LLM processing - pure HTML to markdown conversion") - print(" • Perfect for content archival and documentation") - print(" • 80% cheaper than AI extraction modes!") - - -if __name__ == "__main__": - asyncio.run(main()) diff 
--git a/scrapegraph-py/examples/crawl/async/async_crawl_sitemap_example.py b/scrapegraph-py/examples/crawl/async/async_crawl_sitemap_example.py deleted file mode 100644 index 3e18b42..0000000 --- a/scrapegraph-py/examples/crawl/async/async_crawl_sitemap_example.py +++ /dev/null @@ -1,239 +0,0 @@ -#!/usr/bin/env python3 -""" -Async example demonstrating the ScrapeGraphAI Crawler with sitemap functionality. - -This example shows how to use the async crawler with sitemap enabled for better page discovery: -- Sitemap helps discover more pages efficiently -- Better coverage of website content -- More comprehensive crawling results - -Requirements: -- Python 3.7+ -- scrapegraph-py -- aiohttp (installed with scrapegraph-py) -- A valid API key - -Usage: - python async_crawl_sitemap_example.py -""" - -import asyncio -import json -import os -from typing import Any, Dict - -from scrapegraph_py import AsyncClient - - -async def poll_for_result( - client: AsyncClient, crawl_id: str, max_attempts: int = 20 -) -> Dict[str, Any]: - """ - Poll for crawl results with intelligent backoff to avoid rate limits. 
- - Args: - client: The async ScrapeGraph client - crawl_id: The crawl ID to poll for - max_attempts: Maximum number of polling attempts - - Returns: - The final result or raises an exception on timeout/failure - """ - print("⏳ Starting to poll for results with rate-limit protection...") - - # Initial wait to give the job time to start processing - await asyncio.sleep(15) - - for attempt in range(max_attempts): - try: - result = await client.get_crawl(crawl_id) - status = result.get("status") - - if status == "success": - return result - elif status == "failed": - raise Exception(f"Crawl failed: {result.get('error', 'Unknown error')}") - else: - # Calculate progressive wait time: start at 15s, increase gradually - base_wait = 15 - progressive_wait = min(60, base_wait + (attempt * 3)) # Cap at 60s - - print( - f"⏳ Status: {status} (attempt {attempt + 1}/{max_attempts}) - waiting {progressive_wait}s..." - ) - await asyncio.sleep(progressive_wait) - - except Exception as e: - if "rate" in str(e).lower() or "429" in str(e): - wait_time = min(90, 45 + (attempt * 10)) - print(f"⚠️ Rate limit detected in error, waiting {wait_time}s...") - await asyncio.sleep(wait_time) - continue - else: - print(f"❌ Error polling for results: {e}") - if attempt < max_attempts - 1: - await asyncio.sleep(20) # Wait before retry - continue - raise - - raise Exception(f"⏰ Timeout: Job did not complete after {max_attempts} attempts") - - -async def sitemap_crawling_example(): - """ - Async Sitemap-enabled Crawling Example - - This example demonstrates how to use sitemap for better page discovery with async client. - Sitemap helps the crawler find more pages efficiently by using the website's sitemap.xml. 
- """ - print("=" * 60) - print("ASYNC SITEMAP-ENABLED CRAWLING EXAMPLE") - print("=" * 60) - print("Use case: Comprehensive website crawling with sitemap discovery") - print("Benefits: Better page coverage, more efficient crawling") - print("Features: Sitemap-based page discovery, structured data extraction") - print() - - # Initialize the async client - client = AsyncClient.from_env() - - # Target URL - using a website that likely has a sitemap - url = "https://www.giemmeagordo.com/risultati-ricerca-annunci/?sort=newest&search_city=&search_lat=null&search_lng=null&search_category=0&search_type=0&search_min_price=&search_max_price=&bagni=&bagni_comparison=equal&camere=&camere_comparison=equal" - - # Schema for real estate listings - schema = { - "type": "object", - "properties": { - "listings": { - "type": "array", - "items": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "price": {"type": "string"}, - "location": {"type": "string"}, - "description": {"type": "string"}, - "features": {"type": "array", "items": {"type": "string"}}, - "url": {"type": "string"}, - }, - }, - } - }, - } - - prompt = "Extract all real estate listings with their details including title, price, location, description, and features" - - print(f"🌐 Target URL: {url}") - print("🤖 AI Prompt: Extract real estate listings") - print("📊 Crawl Depth: 1") - print("📄 Max Pages: 10") - print("🗺️ Use Sitemap: True (enabled for better page discovery)") - print("🏠 Same Domain Only: True") - print("💾 Cache Website: True") - print("💡 Mode: AI extraction with sitemap discovery") - print() - - # Start the sitemap-enabled crawl job - print("🚀 Starting async sitemap-enabled crawl job...") - - # Call crawl with sitemap=True for better page discovery - response = await client.crawl( - url=url, - prompt=prompt, - data_schema=schema, - extraction_mode=True, # AI extraction mode - depth=1, - max_pages=10, - same_domain_only=True, - cache_website=True, - sitemap=True, # Enable sitemap for 
better page discovery - ) - - crawl_id = response.get("crawl_id") or response.get("task_id") - - if not crawl_id: - print("❌ Failed to start sitemap-enabled crawl job") - return - - print(f"📋 Crawl ID: {crawl_id}") - print("⏳ Polling for results...") - print() - - # Poll for results with rate-limit protection - try: - result = await poll_for_result(client, crawl_id, max_attempts=20) - - print("✅ Async sitemap-enabled crawl completed successfully!") - print() - - result_data = result.get("result", {}) - llm_result = result_data.get("llm_result", {}) - crawled_urls = result_data.get("crawled_urls", []) - credits_used = result_data.get("credits_used", 0) - pages_processed = result_data.get("pages_processed", 0) - - # Prepare JSON output - json_output = { - "crawl_results": { - "pages_processed": pages_processed, - "credits_used": credits_used, - "cost_per_page": ( - credits_used / pages_processed if pages_processed > 0 else 0 - ), - "crawled_urls": crawled_urls, - "sitemap_enabled": True, - }, - "extracted_data": llm_result, - } - - # Print JSON output - print("📊 RESULTS IN JSON FORMAT:") - print("-" * 40) - print(json.dumps(json_output, indent=2, ensure_ascii=False)) - - # Print summary - print("\n" + "=" * 60) - print("📈 CRAWL SUMMARY:") - print("=" * 60) - print(f"✅ Pages processed: {pages_processed}") - print(f"💰 Credits used: {credits_used}") - print(f"🔗 URLs crawled: {len(crawled_urls)}") - print(f"🗺️ Sitemap enabled: Yes") - print(f"📊 Data extracted: {len(llm_result.get('listings', []))} listings found") - - except Exception as e: - print(f"❌ Async sitemap-enabled crawl failed: {str(e)}") - - -async def main(): - """Run the async sitemap crawling example.""" - print("🌐 ScrapeGraphAI Async Crawler - Sitemap Example") - print("Comprehensive website crawling with sitemap discovery") - print("=" * 60) - - # Check if API key is set - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("⚠️ Please set your API key in the environment variable SGAI_API_KEY") - 
print(" export SGAI_API_KEY=your_api_key_here") - print() - print(" You can get your API key from: https://dashboard.scrapegraphai.com") - return - - print(f"🔑 Using API key: {api_key[:10]}...") - print() - - # Run the sitemap crawling example - await sitemap_crawling_example() - - print("\n" + "=" * 60) - print("🎉 Example completed!") - print("💡 This demonstrates async sitemap-enabled crawling:") - print(" • Better page discovery using sitemap.xml") - print(" • More comprehensive website coverage") - print(" • Efficient crawling of structured websites") - print(" • Perfect for e-commerce, news sites, and content-heavy websites") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/crawl/async/async_crawl_with_path_filtering_example.py b/scrapegraph-py/examples/crawl/async/async_crawl_with_path_filtering_example.py deleted file mode 100644 index b3650ad..0000000 --- a/scrapegraph-py/examples/crawl/async/async_crawl_with_path_filtering_example.py +++ /dev/null @@ -1,98 +0,0 @@ -""" -Example of using the async crawl endpoint with path filtering. - -This example demonstrates how to use include_paths and exclude_paths -to control which pages are crawled on a website (async version). 
-""" -import asyncio -import os -from scrapegraph_py import AsyncClient -from pydantic import BaseModel, Field - - -# Define your output schema -class ProductInfo(BaseModel): - name: str = Field(description="Product name") - price: str = Field(description="Product price") - category: str = Field(description="Product category") - - -class CrawlResult(BaseModel): - products: list[ProductInfo] = Field(description="List of products found") - categories: list[str] = Field(description="List of product categories") - - -async def main(): - # Initialize the async client - sgai_api_key = os.getenv("SGAI_API_KEY") - - async with AsyncClient(api_key=sgai_api_key) as client: - print("🔍 Starting async crawl with path filtering...") - print("=" * 50) - - # Example: Crawl only product pages, excluding certain sections - print("\n📝 Crawling e-commerce site with smart path filtering") - print("-" * 50) - - result = await client.crawl( - url="https://example-shop.com", - prompt="Extract all products with their names, prices, and categories", - data_schema=CrawlResult.model_json_schema(), - extraction_mode=True, - depth=3, - max_pages=50, - sitemap=True, # Use sitemap for better coverage - include_paths=[ - "/products/**", # Include all product pages - "/categories/*", # Include category listings - "/collections/*" # Include collection pages - ], - exclude_paths=[ - "/products/out-of-stock/*", # Skip out-of-stock items - "/products/*/reviews", # Skip review pages - "/admin/**", # Skip admin pages - "/api/**", # Skip API endpoints - "/*.pdf" # Skip PDF files - ] - ) - - print(f"Task ID: {result.get('task_id')}") - print("\n✅ Async crawl job started successfully!") - - # You can then poll for results using get_crawl - task_id = result.get('task_id') - if task_id: - print(f"\n⏳ Polling for results (task: {task_id})...") - - # Poll every 5 seconds until complete - max_attempts = 60 # 5 minutes max - for attempt in range(max_attempts): - await asyncio.sleep(5) - status = await 
client.get_crawl(task_id) - - state = status.get('state', 'UNKNOWN') - print(f"Attempt {attempt + 1}: Status = {state}") - - if state == 'SUCCESS': - print("\n✨ Crawl completed successfully!") - result_data = status.get('result', {}) - print(f"Found {len(result_data.get('products', []))} products") - break - elif state in ['FAILURE', 'REVOKED']: - print(f"\n❌ Crawl failed with status: {state}") - break - else: - print("\n⏰ Timeout: Crawl took too long") - - print("\n" + "=" * 50) - print("💡 Tips for effective path filtering:") - print("=" * 50) - print("• Combine with sitemap=True for better page discovery") - print("• Use include_paths to focus on content-rich sections") - print("• Use exclude_paths to skip pages with duplicate content") - print("• Test your patterns on a small max_pages first") - print("• Remember: exclude_paths overrides include_paths") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/crawl/sync/basic_crawl_example.py b/scrapegraph-py/examples/crawl/sync/basic_crawl_example.py deleted file mode 100644 index 1f58d5d..0000000 --- a/scrapegraph-py/examples/crawl/sync/basic_crawl_example.py +++ /dev/null @@ -1,138 +0,0 @@ -""" -Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint with a custom schema. 
- -Requirements: -- Python 3.7+ -- scrapegraph-py -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY="your_sgai_api_key" -""" - -import json -import os -import time -from typing import Any, Dict - -from dotenv import load_dotenv - -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -def main(): - if not os.getenv("SGAI_API_KEY"): - print("Error: SGAI_API_KEY not found in .env file") - print("Please create a .env file with your API key:") - print('SGAI_API_KEY="your_sgai_api_key"') - return - - schema: Dict[str, Any] = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "ScrapeGraphAI Website Content", - "type": "object", - "properties": { - "company": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "description": {"type": "string"}, - "features": {"type": "array", "items": {"type": "string"}}, - "contact_email": {"type": "string", "format": "email"}, - "social_links": { - "type": "object", - "properties": { - "github": {"type": "string", "format": "uri"}, - "linkedin": {"type": "string", "format": "uri"}, - "twitter": {"type": "string", "format": "uri"}, - }, - "additionalProperties": False, - }, - }, - "required": ["name", "description"], - }, - "services": { - "type": "array", - "items": { - "type": "object", - "properties": { - "service_name": {"type": "string"}, - "description": {"type": "string"}, - "features": {"type": "array", "items": {"type": "string"}}, - }, - "required": ["service_name", "description"], - }, - }, - "legal": { - "type": "object", - "properties": { - "privacy_policy": {"type": "string"}, - "terms_of_service": {"type": "string"}, - }, - "required": ["privacy_policy", "terms_of_service"], - }, - }, - "required": ["company", "services", "legal"], - } - - url = "https://scrapegraphai.com/" - prompt = ( - "What does the company do? 
and I need text content from there privacy and terms" - ) - - try: - client = Client.from_env() - print(f"\nStarting crawl for: {url}") - start_time = time.time() - crawl_response = client.crawl( - url=url, - prompt=prompt, - data_schema=schema, - cache_website=True, - depth=2, - max_pages=2, - same_domain_only=True, - sitemap=True, # Use sitemap for better page discovery - batch_size=1, - ) - execution_time = time.time() - start_time - print(f"POST /v1/crawl/ execution time: {execution_time:.2f} seconds") - print("\nCrawl job started. Response:") - print(json.dumps(crawl_response, indent=2)) - - crawl_id = crawl_response.get("id") or crawl_response.get("task_id") - start_time = time.time() - if crawl_id: - print("\nPolling for crawl result...") - for _ in range(10): - time.sleep(5) - result = client.get_crawl(crawl_id) - if result.get("status") == "success" and result.get("result"): - execution_time = time.time() - start_time - print( - f"GET /v1/crawl/{crawl_id} execution time: {execution_time:.2f} seconds" - ) - print("\nCrawl completed. Result:") - print(json.dumps(result["result"]["llm_result"], indent=2)) - break - elif result.get("status") == "failed": - print("\nCrawl failed. Result:") - print(json.dumps(result, indent=2)) - break - else: - print(f"Status: {result.get('status')}, waiting...") - else: - print("Crawl did not complete in time.") - else: - print("No crawl ID found in response. Synchronous result:") - print(json.dumps(crawl_response, indent=2)) - - except Exception as e: - print(f"Error occurred: {str(e)}") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/crawl/sync/crawl_example.py b/scrapegraph-py/examples/crawl/sync/crawl_example.py deleted file mode 100644 index fa25639..0000000 --- a/scrapegraph-py/examples/crawl/sync/crawl_example.py +++ /dev/null @@ -1,115 +0,0 @@ -""" -Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint. 
- -Requirements: -- Python 3.7+ -- scrapegraph-py -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import json -import os -import time - -from dotenv import load_dotenv - -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -def main(): - if not os.getenv("SGAI_API_KEY"): - print("Error: SGAI_API_KEY not found in .env file") - print("Please create a .env file with your API key:") - print("SGAI_API_KEY=your_api_key_here") - return - - # Simple schema for founders' information - schema = { - "type": "object", - "properties": { - "founders": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "title": {"type": "string"}, - "bio": {"type": "string"}, - "linkedin": {"type": "string"}, - "twitter": {"type": "string"}, - }, - }, - } - }, - } - - url = "https://scrapegraphai.com" - prompt = "extract the founders'infos" - - try: - # Initialize the client - client = Client.from_env() - - # Start the crawl job - print(f"\nStarting crawl for: {url}") - start_time = time.time() - crawl_response = client.crawl( - url=url, - prompt=prompt, - data_schema=schema, - cache_website=True, - depth=2, - max_pages=2, - same_domain_only=True, - sitemap=True, # Use sitemap for better page discovery - # batch_size is optional and will be excluded if not provided - ) - execution_time = time.time() - start_time - print(f"POST /v1/crawl/ execution time: {execution_time:.2f} seconds") - print("\nCrawl job started. 
Response:") - print(json.dumps(crawl_response, indent=2)) - - # If the crawl is asynchronous and returns an ID, fetch the result - crawl_id = crawl_response.get("id") or crawl_response.get("task_id") - start_time = time.time() - if crawl_id: - print("\nPolling for crawl result...") - # Increase timeout to 5 minutes (60 iterations × 5 seconds) - for i in range(60): - time.sleep(5) - result = client.get_crawl(crawl_id) - if result.get("status") == "success" and result.get("result"): - execution_time = time.time() - start_time - print( - f"GET /v1/crawl/{crawl_id} execution time: {execution_time:.2f} seconds" - ) - print("\nCrawl completed. Result:") - print(json.dumps(result["result"]["llm_result"], indent=2)) - break - elif result.get("status") == "failed": - print("\nCrawl failed. Result:") - print(json.dumps(result, indent=2)) - break - else: - elapsed_time = (i + 1) * 5 - print( - f"Status: {result.get('status')}, waiting... ({elapsed_time}s elapsed)" - ) - else: - print("Crawl did not complete within 5 minutes.") - else: - print("No crawl ID found in response. Synchronous result:") - print(json.dumps(crawl_response, indent=2)) - - except Exception as e: - print(f"Error occurred: {str(e)}") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/crawl/sync/crawl_markdown_direct_api_example.py b/scrapegraph-py/examples/crawl/sync/crawl_markdown_direct_api_example.py deleted file mode 100644 index 2f73ab4..0000000 --- a/scrapegraph-py/examples/crawl/sync/crawl_markdown_direct_api_example.py +++ /dev/null @@ -1,254 +0,0 @@ -#!/usr/bin/env python3 -""" -Example script demonstrating the ScrapeGraphAI Crawler markdown conversion mode. 
- -This example shows how to use the crawler in markdown conversion mode: -- Cost-effective markdown conversion (NO AI/LLM processing) -- 2 credits per page (80% savings compared to AI mode) -- Clean HTML to markdown conversion with metadata extraction - -Requirements: -- Python 3.7+ -- requests -- python-dotenv -- A .env file with your API_KEY - -Example .env file: -API_KEY=your_api_key_here -""" - -import json -import os -import time -from typing import Any, Dict - -import requests -from dotenv import load_dotenv - -# Load environment variables from .env file -load_dotenv() - -# Configuration - API key from environment or fallback -API_KEY = os.getenv("TEST_API_KEY", "sgai-xxx") # Load from .env file -BASE_URL = os.getenv("BASE_URL", "http://localhost:8001") # Can be overridden via env - - -def make_request(url: str, data: Dict[str, Any]) -> Dict[str, Any]: - """Make an HTTP request to the API.""" - headers = {"Content-Type": "application/json", "SGAI-APIKEY": API_KEY} - - response = requests.post(url, json=data, headers=headers) - return response.json() - - -def poll_result(task_id: str) -> Dict[str, Any]: - """Poll for the result of a crawl job with rate limit handling.""" - headers = {"SGAI-APIKEY": API_KEY} - url = f"{BASE_URL}/v1/crawl/{task_id}" - - response = requests.get(url, headers=headers) - - if response.status_code == 429: - # Rate limited - return special status to handle in polling loop - return {"status": "rate_limited", "retry_after": 60} - - return response.json() - - -def poll_with_backoff(task_id: str, max_attempts: int = 20) -> Dict[str, Any]: - """ - Poll for crawl results with intelligent backoff to avoid rate limits. 
- - Args: - task_id: The task ID to poll for - max_attempts: Maximum number of polling attempts - - Returns: - The final result or raises an exception on timeout/failure - """ - print("⏳ Starting to poll for results with rate-limit protection...") - - # Initial wait to give the job time to start processing - time.sleep(15) - - for attempt in range(max_attempts): - try: - result = poll_result(task_id) - status = result.get("status") - - if status == "rate_limited": - wait_time = min( - 90, 30 + (attempt * 10) - ) # Exponential backoff for rate limits - print(f"⚠️ Rate limited! Waiting {wait_time}s before retry...") - time.sleep(wait_time) - continue - - elif status == "success": - return result - - elif status == "failed": - raise Exception(f"Crawl failed: {result.get('error', 'Unknown error')}") - - else: - # Calculate progressive wait time: start at 15s, increase gradually - base_wait = 15 - progressive_wait = min(60, base_wait + (attempt * 3)) # Cap at 60s - - print( - f"⏳ Status: {status} (attempt {attempt + 1}/{max_attempts}) - waiting {progressive_wait}s..." - ) - time.sleep(progressive_wait) - - except Exception as e: - if "rate" in str(e).lower() or "429" in str(e): - wait_time = min(90, 45 + (attempt * 10)) - print(f"⚠️ Rate limit detected in error, waiting {wait_time}s...") - time.sleep(wait_time) - continue - else: - print(f"❌ Error polling for results: {e}") - if attempt < max_attempts - 1: - time.sleep(20) # Wait before retry - continue - raise - - raise Exception(f"⏰ Timeout: Job did not complete after {max_attempts} attempts") - - -def markdown_crawling_example(): - """ - Markdown Conversion Mode (NO AI/LLM Used) - - This example demonstrates cost-effective crawling that converts pages to clean markdown - WITHOUT any AI processing. Perfect for content archival and when you only need clean markdown. 
- """ - print("=" * 60) - print("MARKDOWN CONVERSION MODE (NO AI/LLM)") - print("=" * 60) - print("Use case: Get clean markdown content without AI processing") - print("Cost: 2 credits per page (80% savings!)") - print("Features: Clean markdown conversion, metadata extraction") - print("⚠️ NO AI/LLM PROCESSING - Pure HTML to markdown conversion only!") - print() - - # Markdown conversion request - NO AI/LLM processing - request_data = { - "url": "https://scrapegraphai.com/", - "extraction_mode": False, # FALSE = Markdown conversion mode (NO AI/LLM used) - "depth": 2, - "max_pages": 2, - "same_domain_only": True, - "sitemap": False, # Use sitemap for better coverage - # Note: No prompt needed when extraction_mode = False - } - - print(f"🌐 Target URL: {request_data['url']}") - print("🤖 AI Prompt: None (no AI processing)") - print(f"📊 Crawl Depth: {request_data['depth']}") - print(f"📄 Max Pages: {request_data['max_pages']}") - print(f"🗺️ Use Sitemap: {request_data['sitemap']}") - print("💡 Mode: Pure HTML to markdown conversion") - print() - - # Start the markdown conversion job - print("🚀 Starting markdown conversion job...") - response = make_request(f"{BASE_URL}/v1/crawl", request_data) - task_id = response.get("task_id") - - if not task_id: - print("❌ Failed to start markdown conversion job") - return - - print(f"📋 Task ID: {task_id}") - print("⏳ Polling for results...") - print() - - # Poll for results with rate-limit protection - try: - result = poll_with_backoff(task_id, max_attempts=20) - - print("✅ Markdown conversion completed successfully!") - print() - - result_data = result.get("result", {}) - pages = result_data.get("pages", []) - crawled_urls = result_data.get("crawled_urls", []) - credits_used = result_data.get("credits_used", 0) - pages_processed = result_data.get("pages_processed", 0) - - # Prepare JSON output - json_output = { - "conversion_results": { - "pages_processed": pages_processed, - "credits_used": credits_used, - "cost_per_page": ( - 
credits_used / pages_processed if pages_processed > 0 else 0 - ), - "crawled_urls": crawled_urls, - }, - "markdown_content": {"total_pages": len(pages), "pages": []}, - } - - # Add page details to JSON - for i, page in enumerate(pages): - metadata = page.get("metadata", {}) - page_data = { - "page_number": i + 1, - "url": page.get("url"), - "title": page.get("title"), - "metadata": { - "word_count": metadata.get("word_count", 0), - "headers": metadata.get("headers", []), - "links_count": metadata.get("links_count", 0), - }, - "markdown_content": page.get("markdown", ""), - } - json_output["markdown_content"]["pages"].append(page_data) - - # Print JSON output - print("📊 RESULTS IN JSON FORMAT:") - print("-" * 40) - print(json.dumps(json_output, indent=2, ensure_ascii=False)) - - except Exception as e: - print(f"❌ Markdown conversion failed: {str(e)}") - - -def main(): - """Run the markdown crawling example.""" - print("🌐 ScrapeGraphAI Crawler - Markdown Conversion Example") - print("Cost-effective HTML to Markdown conversion (NO AI/LLM)") - print("=" * 60) - - # Check if API key is set - if API_KEY == "sgai-xxx": - print("⚠️ Please set your API key in the .env file") - print(" Create a .env file with your API key:") - print(" API_KEY=your_api_key_here") - print() - print(" You can get your API key from: https://dashboard.scrapegraphai.com") - print() - print(" Example .env file:") - print(" API_KEY=sgai-your-actual-api-key-here") - print(" BASE_URL=https://api.scrapegraphai.com # Optional") - return - - print(f"🔑 Using API key: {API_KEY[:10]}...") - print(f"🌐 Base URL: {BASE_URL}") - print() - - # Run the single example - markdown_crawling_example() # Markdown conversion mode (NO AI) - - print("\n" + "=" * 60) - print("🎉 Example completed!") - print("💡 This demonstrates markdown conversion mode:") - print(" • Cost-effective: Only 2 credits per page") - print(" • No AI/LLM processing - pure HTML to markdown conversion") - print(" • Perfect for content archival and 
documentation") - print(" • 80% cheaper than AI extraction modes!") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/crawl/sync/crawl_markdown_example.py b/scrapegraph-py/examples/crawl/sync/crawl_markdown_example.py deleted file mode 100644 index 01c682b..0000000 --- a/scrapegraph-py/examples/crawl/sync/crawl_markdown_example.py +++ /dev/null @@ -1,226 +0,0 @@ -#!/usr/bin/env python3 -""" -Example demonstrating the ScrapeGraphAI Crawler markdown conversion mode. - -This example shows how to use the crawler in markdown conversion mode: -- Cost-effective markdown conversion (NO AI/LLM processing) -- 2 credits per page (80% savings compared to AI mode) -- Clean HTML to markdown conversion with metadata extraction - -Requirements: -- Python 3.7+ -- scrapegraph-py -- python-dotenv -- A valid API key (set in .env file as SGAI_API_KEY=your_key or environment variable) - -Usage: - python crawl_markdown_example.py -""" - -import json -import os -import time -from typing import Any, Dict - -from dotenv import load_dotenv - -from scrapegraph_py import Client - - -def poll_for_result( - client: Client, crawl_id: str, max_attempts: int = 20 -) -> Dict[str, Any]: - """ - Poll for crawl results with intelligent backoff to avoid rate limits. 
- - Args: - client: The ScrapeGraph client - crawl_id: The crawl ID to poll for - max_attempts: Maximum number of polling attempts - - Returns: - The final result or raises an exception on timeout/failure - """ - print("⏳ Starting to poll for results with rate-limit protection...") - - # Initial wait to give the job time to start processing - time.sleep(15) - - for attempt in range(max_attempts): - try: - result = client.get_crawl(crawl_id) - status = result.get("status") - - if status == "success": - return result - elif status == "failed": - raise Exception(f"Crawl failed: {result.get('error', 'Unknown error')}") - else: - # Calculate progressive wait time: start at 15s, increase gradually - base_wait = 15 - progressive_wait = min(60, base_wait + (attempt * 3)) # Cap at 60s - - print( - f"⏳ Status: {status} (attempt {attempt + 1}/{max_attempts}) - waiting {progressive_wait}s..." - ) - time.sleep(progressive_wait) - - except Exception as e: - if "rate" in str(e).lower() or "429" in str(e): - wait_time = min(90, 45 + (attempt * 10)) - print(f"⚠️ Rate limit detected in error, waiting {wait_time}s...") - time.sleep(wait_time) - continue - else: - print(f"❌ Error polling for results: {e}") - if attempt < max_attempts - 1: - time.sleep(20) # Wait before retry - continue - raise - - raise Exception(f"⏰ Timeout: Job did not complete after {max_attempts} attempts") - - -def markdown_crawling_example(): - """ - Markdown Conversion Mode (NO AI/LLM Used) - - This example demonstrates cost-effective crawling that converts pages to clean markdown - WITHOUT any AI processing. Perfect for content archival and when you only need clean markdown. 
- """ - print("=" * 60) - print("MARKDOWN CONVERSION MODE (NO AI/LLM)") - print("=" * 60) - print("Use case: Get clean markdown content without AI processing") - print("Cost: 2 credits per page (80% savings!)") - print("Features: Clean markdown conversion, metadata extraction") - print("⚠️ NO AI/LLM PROCESSING - Pure HTML to markdown conversion only!") - print() - - # Initialize the client - client = Client.from_env() - - # Target URL for markdown conversion - url = "https://scrapegraphai.com/" - - print(f"🌐 Target URL: {url}") - print("🤖 AI Prompt: None (no AI processing)") - print("📊 Crawl Depth: 2") - print("📄 Max Pages: 2") - print("🗺️ Use Sitemap: True") - print("💡 Mode: Pure HTML to markdown conversion") - print() - - # Start the markdown conversion job - print("🚀 Starting markdown conversion job...") - - # Call crawl with extraction_mode=False for markdown conversion - response = client.crawl( - url=url, - extraction_mode=False, # FALSE = Markdown conversion mode (NO AI/LLM used) - depth=2, - max_pages=2, - same_domain_only=True, - sitemap=True, # Use sitemap for better coverage - # Note: No prompt or data_schema needed when extraction_mode=False - ) - - crawl_id = response.get("crawl_id") or response.get("task_id") - - if not crawl_id: - print("❌ Failed to start markdown conversion job") - return - - print(f"📋 Crawl ID: {crawl_id}") - print("⏳ Polling for results...") - print() - - # Poll for results with rate-limit protection - try: - result = poll_for_result(client, crawl_id, max_attempts=20) - - print("✅ Markdown conversion completed successfully!") - print() - - result_data = result.get("result", {}) - pages = result_data.get("pages", []) - crawled_urls = result_data.get("crawled_urls", []) - credits_used = result_data.get("credits_used", 0) - pages_processed = result_data.get("pages_processed", 0) - - # Prepare JSON output - json_output = { - "conversion_results": { - "pages_processed": pages_processed, - "credits_used": credits_used, - 
"cost_per_page": ( - credits_used / pages_processed if pages_processed > 0 else 0 - ), - "crawled_urls": crawled_urls, - }, - "markdown_content": {"total_pages": len(pages), "pages": []}, - } - - # Add page details to JSON - for i, page in enumerate(pages): - metadata = page.get("metadata", {}) - page_data = { - "page_number": i + 1, - "url": page.get("url"), - "title": page.get("title"), - "metadata": { - "word_count": metadata.get("word_count", 0), - "headers": metadata.get("headers", []), - "links_count": metadata.get("links_count", 0), - }, - "markdown_content": page.get("markdown", ""), - } - json_output["markdown_content"]["pages"].append(page_data) - - # Print JSON output - print("📊 RESULTS IN JSON FORMAT:") - print("-" * 40) - print(json.dumps(json_output, indent=2, ensure_ascii=False)) - - except Exception as e: - print(f"❌ Markdown conversion failed: {str(e)}") - - -def main(): - """Run the markdown crawling example.""" - print("🌐 ScrapeGraphAI Crawler - Markdown Conversion Example") - print("Cost-effective HTML to Markdown conversion (NO AI/LLM)") - print("=" * 60) - - # Load environment variables from .env file - load_dotenv() - - # Check if API key is set - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("⚠️ Please set your API key in the environment variable SGAI_API_KEY") - print(" Option 1: Create a .env file with: SGAI_API_KEY=your_api_key_here") - print( - " Option 2: Set environment variable: export SGAI_API_KEY=your_api_key_here" - ) - print() - print(" You can get your API key from: https://dashboard.scrapegraphai.com") - return - - print(f"🔑 Using API key: {api_key[:10]}...") - print() - - # Run the markdown conversion example - markdown_crawling_example() - - print("\n" + "=" * 60) - print("🎉 Example completed!") - print("💡 This demonstrates markdown conversion mode:") - print(" • Cost-effective: Only 2 credits per page") - print(" • No AI/LLM processing - pure HTML to markdown conversion") - print(" • Perfect for content 
archival and documentation") - print(" • 80% cheaper than AI extraction modes!") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/crawl/sync/crawl_sitemap_example.py b/scrapegraph-py/examples/crawl/sync/crawl_sitemap_example.py deleted file mode 100644 index 1fb9a69..0000000 --- a/scrapegraph-py/examples/crawl/sync/crawl_sitemap_example.py +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env python3 -""" -Example demonstrating the ScrapeGraphAI Crawler with sitemap functionality. - -This example shows how to use the crawler with sitemap enabled for better page discovery: -- Sitemap helps discover more pages efficiently -- Better coverage of website content -- More comprehensive crawling results - -Requirements: -- Python 3.7+ -- scrapegraph-py -- python-dotenv -- A valid API key (set in .env file as SGAI_API_KEY=your_key or environment variable) - -Usage: - python crawl_sitemap_example.py -""" - -import json -import os -import time -from typing import Any, Dict - -from dotenv import load_dotenv - -from scrapegraph_py import Client - - -def poll_for_result( - client: Client, crawl_id: str, max_attempts: int = 20 -) -> Dict[str, Any]: - """ - Poll for crawl results with intelligent backoff to avoid rate limits. 
- - Args: - client: The ScrapeGraph client - crawl_id: The crawl ID to poll for - max_attempts: Maximum number of polling attempts - - Returns: - The final result or raises an exception on timeout/failure - """ - print("⏳ Starting to poll for results with rate-limit protection...") - - # Initial wait to give the job time to start processing - time.sleep(15) - - for attempt in range(max_attempts): - try: - result = client.get_crawl(crawl_id) - status = result.get("status") - - if status == "success": - return result - elif status == "failed": - raise Exception(f"Crawl failed: {result.get('error', 'Unknown error')}") - else: - # Calculate progressive wait time: start at 15s, increase gradually - base_wait = 15 - progressive_wait = min(60, base_wait + (attempt * 3)) # Cap at 60s - - print( - f"⏳ Status: {status} (attempt {attempt + 1}/{max_attempts}) - waiting {progressive_wait}s..." - ) - time.sleep(progressive_wait) - - except Exception as e: - if "rate" in str(e).lower() or "429" in str(e): - wait_time = min(90, 45 + (attempt * 10)) - print(f"⚠️ Rate limit detected in error, waiting {wait_time}s...") - time.sleep(wait_time) - continue - else: - print(f"❌ Error polling for results: {e}") - if attempt < max_attempts - 1: - time.sleep(20) # Wait before retry - continue - raise - - raise Exception(f"⏰ Timeout: Job did not complete after {max_attempts} attempts") - - -def sitemap_crawling_example(): - """ - Sitemap-enabled Crawling Example - - This example demonstrates how to use sitemap for better page discovery. - Sitemap helps the crawler find more pages efficiently by using the website's sitemap.xml. 
- """ - print("=" * 60) - print("SITEMAP-ENABLED CRAWLING EXAMPLE") - print("=" * 60) - print("Use case: Comprehensive website crawling with sitemap discovery") - print("Benefits: Better page coverage, more efficient crawling") - print("Features: Sitemap-based page discovery, structured data extraction") - print() - - # Initialize the client - client = Client.from_env() - - # Target URL - using a website that likely has a sitemap - url = "https://www.giemmeagordo.com/risultati-ricerca-annunci/?sort=newest&search_city=&search_lat=null&search_lng=null&search_category=0&search_type=0&search_min_price=&search_max_price=&bagni=&bagni_comparison=equal&camere=&camere_comparison=equal" - - # Schema for real estate listings - schema = { - "type": "object", - "properties": { - "listings": { - "type": "array", - "items": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "price": {"type": "string"}, - "location": {"type": "string"}, - "description": {"type": "string"}, - "features": {"type": "array", "items": {"type": "string"}}, - "url": {"type": "string"}, - }, - }, - } - }, - } - - prompt = "Extract all real estate listings with their details including title, price, location, description, and features" - - print(f"🌐 Target URL: {url}") - print("🤖 AI Prompt: Extract real estate listings") - print("📊 Crawl Depth: 1") - print("📄 Max Pages: 10") - print("🗺️ Use Sitemap: True (enabled for better page discovery)") - print("🏠 Same Domain Only: True") - print("💾 Cache Website: True") - print("💡 Mode: AI extraction with sitemap discovery") - print() - - # Start the sitemap-enabled crawl job - print("🚀 Starting sitemap-enabled crawl job...") - - # Call crawl with sitemap=True for better page discovery - response = client.crawl( - url=url, - prompt=prompt, - data_schema=schema, - extraction_mode=True, # AI extraction mode - depth=1, - max_pages=10, - same_domain_only=True, - cache_website=True, - sitemap=True, # Enable sitemap for better page discovery - ) - - 
crawl_id = response.get("crawl_id") or response.get("task_id") - - if not crawl_id: - print("❌ Failed to start sitemap-enabled crawl job") - return - - print(f"📋 Crawl ID: {crawl_id}") - print("⏳ Polling for results...") - print() - - # Poll for results with rate-limit protection - try: - result = poll_for_result(client, crawl_id, max_attempts=20) - - print("✅ Sitemap-enabled crawl completed successfully!") - print() - - result_data = result.get("result", {}) - llm_result = result_data.get("llm_result", {}) - crawled_urls = result_data.get("crawled_urls", []) - credits_used = result_data.get("credits_used", 0) - pages_processed = result_data.get("pages_processed", 0) - - # Prepare JSON output - json_output = { - "crawl_results": { - "pages_processed": pages_processed, - "credits_used": credits_used, - "cost_per_page": ( - credits_used / pages_processed if pages_processed > 0 else 0 - ), - "crawled_urls": crawled_urls, - "sitemap_enabled": True, - }, - "extracted_data": llm_result, - } - - # Print JSON output - print("📊 RESULTS IN JSON FORMAT:") - print("-" * 40) - print(json.dumps(json_output, indent=2, ensure_ascii=False)) - - # Print summary - print("\n" + "=" * 60) - print("📈 CRAWL SUMMARY:") - print("=" * 60) - print(f"✅ Pages processed: {pages_processed}") - print(f"💰 Credits used: {credits_used}") - print(f"🔗 URLs crawled: {len(crawled_urls)}") - print(f"🗺️ Sitemap enabled: Yes") - print(f"📊 Data extracted: {len(llm_result.get('listings', []))} listings found") - - except Exception as e: - print(f"❌ Sitemap-enabled crawl failed: {str(e)}") - - -def main(): - """Run the sitemap crawling example.""" - print("🌐 ScrapeGraphAI Crawler - Sitemap Example") - print("Comprehensive website crawling with sitemap discovery") - print("=" * 60) - - # Load environment variables from .env file - load_dotenv() - - # Check if API key is set - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("⚠️ Please set your API key in the environment variable SGAI_API_KEY") - 
print(" Option 1: Create a .env file with: SGAI_API_KEY=your_api_key_here") - print( - " Option 2: Set environment variable: export SGAI_API_KEY=your_api_key_here" - ) - print() - print(" You can get your API key from: https://dashboard.scrapegraphai.com") - return - - print(f"🔑 Using API key: {api_key[:10]}...") - print() - - # Run the sitemap crawling example - sitemap_crawling_example() - - print("\n" + "=" * 60) - print("🎉 Example completed!") - print("💡 This demonstrates sitemap-enabled crawling:") - print(" • Better page discovery using sitemap.xml") - print(" • More comprehensive website coverage") - print(" • Efficient crawling of structured websites") - print(" • Perfect for e-commerce, news sites, and content-heavy websites") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/crawl/sync/crawl_with_path_filtering_example.py b/scrapegraph-py/examples/crawl/sync/crawl_with_path_filtering_example.py deleted file mode 100644 index ba9530e..0000000 --- a/scrapegraph-py/examples/crawl/sync/crawl_with_path_filtering_example.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -Example of using the crawl endpoint with path filtering. - -This example demonstrates how to use include_paths and exclude_paths -to control which pages are crawled on a website. 
-""" -import os -from scrapegraph_py import Client -from pydantic import BaseModel, Field - - -# Define your output schema -class ProductInfo(BaseModel): - name: str = Field(description="Product name") - price: str = Field(description="Product price") - description: str = Field(description="Product description") - - -class CrawlResult(BaseModel): - products: list[ProductInfo] = Field(description="List of products found") - total_products: int = Field(description="Total number of products") - - -def main(): - # Initialize the client - sgai_api_key = os.getenv("SGAI_API_KEY") - client = Client(api_key=sgai_api_key) - - print("🔍 Starting crawl with path filtering...") - print("=" * 50) - - # Example 1: Include only specific paths - print("\n📝 Example 1: Crawl only /products/* pages") - print("-" * 50) - - result = client.crawl( - url="https://example.com", - prompt="Extract product information including name, price, and description", - data_schema=CrawlResult.model_json_schema(), - extraction_mode=True, - depth=2, - max_pages=10, - include_paths=["/products/*", "/items/*"], # Only crawl product pages - exclude_paths=["/products/archived/*"] # But skip archived products - ) - - print(f"Task ID: {result.get('task_id')}") - print("\n✅ Crawl job started successfully!") - - # Example 2: Exclude admin and API paths - print("\n📝 Example 2: Crawl all pages except admin and API") - print("-" * 50) - - result = client.crawl( - url="https://example.com", - prompt="Extract all relevant information", - data_schema=CrawlResult.model_json_schema(), - extraction_mode=True, - depth=2, - max_pages=20, - exclude_paths=[ - "/admin/*", # Skip all admin pages - "/api/*", # Skip all API endpoints - "/private/*", # Skip private pages - "/*.json" # Skip JSON files - ] - ) - - print(f"Task ID: {result.get('task_id')}") - print("\n✅ Crawl job started successfully!") - - # Example 3: Complex filtering with wildcards - print("\n📝 Example 3: Complex path filtering with wildcards") - print("-" * 
50) - - result = client.crawl( - url="https://example.com", - prompt="Extract blog content and metadata", - data_schema=CrawlResult.model_json_schema(), - extraction_mode=True, - depth=3, - max_pages=15, - include_paths=[ - "/blog/**", # Include all blog pages (any depth) - "/articles/*", # Include top-level articles - "/news/2024/*" # Include 2024 news only - ], - exclude_paths=[ - "/blog/draft/*", # Skip draft blog posts - "/blog/*/comments" # Skip comment pages - ] - ) - - print(f"Task ID: {result.get('task_id')}") - print("\n✅ Crawl job started successfully!") - - print("\n" + "=" * 50) - print("📚 Path Filtering Guide:") - print("=" * 50) - print("• Use '/*' to match a single path segment") - print(" Example: '/products/*' matches '/products/item1' but not '/products/cat/item1'") - print("\n• Use '/**' to match any number of path segments") - print(" Example: '/blog/**' matches '/blog/2024/post' and '/blog/category/2024/post'") - print("\n• exclude_paths takes precedence over include_paths") - print(" You can include a broad pattern and exclude specific subsets") - print("\n• Paths must start with '/'") - print(" Example: '/products/*' is valid, 'products/*' is not") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/markdownify/async/async_markdownify_example.py b/scrapegraph-py/examples/markdownify/async/async_markdownify_example.py deleted file mode 100644 index 129a690..0000000 --- a/scrapegraph-py/examples/markdownify/async/async_markdownify_example.py +++ /dev/null @@ -1,37 +0,0 @@ -import asyncio - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_logging(level="INFO") - - -async def main(): - # Initialize async client - sgai_client = AsyncClient(api_key="your-api-key-here") - - # Concurrent markdownify requests - urls = [ - "https://scrapegraphai.com/", - "https://github.com/ScrapeGraphAI/Scrapegraph-ai", - ] - - tasks = [sgai_client.markdownify(website_url=url) for url 
in urls] - - # Execute requests concurrently - responses = await asyncio.gather(*tasks, return_exceptions=True) - - # Process results - for i, response in enumerate(responses): - if isinstance(response, Exception): - print(f"\nError for {urls[i]}: {response}") - else: - print(f"\nPage {i+1} Markdown:") - print(f"URL: {urls[i]}") - print(f"Result: {response['result']}") - - await sgai_client.close() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/markdownify/sync/markdownify_example.py b/scrapegraph-py/examples/markdownify/sync/markdownify_example.py deleted file mode 100644 index 90d6bcb..0000000 --- a/scrapegraph-py/examples/markdownify/sync/markdownify_example.py +++ /dev/null @@ -1,16 +0,0 @@ -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_logging(level="INFO") - -# Initialize the client -sgai_client = Client(api_key="your-api-key-here") - -# Markdownify request -response = sgai_client.markdownify( - website_url="https://example.com", -) - -# Print the response -print(f"Request ID: {response['request_id']}") -print(f"Result: {response['result']}") diff --git a/scrapegraph-py/examples/markdownify/sync/markdownify_movements_example.py b/scrapegraph-py/examples/markdownify/sync/markdownify_movements_example.py deleted file mode 100644 index 8f8e9ba..0000000 --- a/scrapegraph-py/examples/markdownify/sync/markdownify_movements_example.py +++ /dev/null @@ -1,314 +0,0 @@ -#!/usr/bin/env python3 -""" -Example demonstrating how to use the Markdownify API with enhanced features. - -This example shows how to: -1. Set up the API request for markdownify with custom headers -2. Make the API call to convert a website to markdown -3. Handle the response and save the markdown content -4. Display comprehensive results with statistics and timing - -Note: Unlike Smart Scraper, Markdownify doesn't support interactive movements/steps. 
-It focuses on converting websites to clean markdown format. - -Requirements: -- Python 3.7+ -- requests -- python-dotenv -- A .env file with your TEST_API_KEY - -Example .env file: -TEST_API_KEY=your_api_key_here -""" - -import os -import time - -import requests -from dotenv import load_dotenv - -# Load environment variables from .env file -load_dotenv() - - -def markdownify_movements(): - """ - Enhanced markdownify function with comprehensive features and timing. - - Note: Markdownify doesn't support interactive movements like Smart Scraper. - Instead, it excels at converting websites to clean markdown format. - """ - # Get API key from .env file - api_key = os.getenv("TEST_API_KEY") - if not api_key: - raise ValueError( - "API key must be provided or set in .env file as TEST_API_KEY. " - "Create a .env file with: TEST_API_KEY=your_api_key_here" - ) - steps = [ - "click on search bar", - "wait for 500ms", - "fill email input box with mdehsan873@gmail.com", - "wait a sec", - "click on the first time of search result", - "wait for 2 seconds to load the result of search", - ] - # Target website configuration - website_url = "https://scrapegraphai.com/" - - # Enhanced headers for better scraping (similar to interactive movements) - custom_headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1", - } - - # Prepare API request headers - headers = { - "SGAI-APIKEY": api_key, - "Content-Type": "application/json", - } - - # Request body for markdownify - body = { - "website_url": website_url, - "headers": custom_headers, - "steps": steps, - } - - print("🚀 Starting Markdownify with Enhanced Features...") - print(f"🌐 Website URL: {website_url}") - 
print(f"📋 Custom Headers: {len(custom_headers)} headers configured") - print("🎯 Goal: Convert website to clean markdown format") - print("\n" + "=" * 60) - - # Start timer - start_time = time.time() - print( - f"⏱️ Timer started at: {time.strftime('%H:%M:%S', time.localtime(start_time))}" - ) - print("🔄 Processing markdown conversion...") - - try: - response = requests.post( - "http://localhost:8001/v1/markdownify", - json=body, - headers=headers, - ) - - # Calculate execution time - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Total execution time: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print( - f"📊 Performance: {execution_time:.1f}s ({execution_minutes:.1f}m) for markdown conversion" - ) - - if response.status_code == 200: - result = response.json() - markdown_content = result.get("result", "") - - print("✅ Request completed successfully!") - print(f"📊 Request ID: {result.get('request_id', 'N/A')}") - print(f"🔄 Status: {result.get('status', 'N/A')}") - print(f"📝 Content Length: {len(markdown_content)} characters") - - if result.get("error"): - print(f"❌ Error: {result['error']}") - else: - print("\n📋 MARKDOWN CONVERSION RESULTS:") - print("=" * 60) - - # Display markdown statistics - lines = markdown_content.split("\n") - words = len(markdown_content.split()) - - print("📊 Statistics:") - print(f" - Total Lines: {len(lines)}") - print(f" - Total Words: {words}") - print(f" - Total Characters: {len(markdown_content)}") - print( - f" - Processing Speed: {len(markdown_content)/execution_time:.0f} chars/second" - ) - - # Display first 500 characters - print("\n🔍 First 500 characters:") - print("-" * 50) - print(markdown_content[:500]) - if len(markdown_content) > 500: - print("...") - print("-" * 50) - - # Save to file - filename = 
f"markdownify_output_{int(time.time())}.md" - save_markdown_to_file(markdown_content, filename) - - # Display content analysis - analyze_markdown_content(markdown_content) - - else: - print(f"❌ Request failed with status code: {response.status_code}") - print(f"Response: {response.text}") - - except requests.exceptions.RequestException as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"🌐 Network error: {str(e)}") - except Exception as e: - end_time = time.time() - execution_time = end_time - start_time - execution_minutes = execution_time / 60 - print( - f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}" - ) - print( - f"⚡ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)" - ) - print(f"💥 Unexpected error: {str(e)}") - - -def save_markdown_to_file(markdown_content: str, filename: str): - """ - Save markdown content to a file with enhanced error handling. - - Args: - markdown_content: The markdown content to save - filename: The name of the file to save to - """ - try: - with open(filename, "w", encoding="utf-8") as f: - f.write(markdown_content) - print(f"💾 Markdown saved to: {filename}") - except Exception as e: - print(f"❌ Error saving file: {str(e)}") - - -def analyze_markdown_content(markdown_content: str): - """ - Analyze the markdown content and provide insights. 
- - Args: - markdown_content: The markdown content to analyze - """ - print("\n🔍 CONTENT ANALYSIS:") - print("-" * 50) - - # Count different markdown elements - lines = markdown_content.split("\n") - headers = [line for line in lines if line.strip().startswith("#")] - links = [line for line in lines if "[" in line and "](" in line] - code_blocks = markdown_content.count("```") - - print(f"📑 Headers found: {len(headers)}") - print(f"🔗 Links found: {len(links)}") - print( - f"💻 Code blocks: {code_blocks // 2}" - ) # Divide by 2 since each block has opening and closing - - # Show first few headers if they exist - if headers: - print("\n📋 First few headers:") - for i, header in enumerate(headers[:3]): - print(f" {i+1}. {header.strip()}") - if len(headers) > 3: - print(f" ... and {len(headers) - 3} more") - - -def show_curl_equivalent(): - """Show the equivalent curl command for reference""" - - # Load environment variables from .env file - load_dotenv() - - api_key = os.getenv("TEST_API_KEY", "your-api-key-here") - curl_command = f""" -curl --location 'http://localhost:8001/v1/markdownify' \\ ---header 'SGAI-APIKEY: {api_key}' \\ ---header 'Content-Type: application/json' \\ ---data '{{ - "website_url": "https://scrapegraphai.com/", - "headers": {{ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1" - }}, - "steps": [ - "click on search bar", - "wait for 500ms", - "fill email input box with mdehsan873@gmail.com", - "wait a sec", - "click on the first time of search result", - "wait for 2 seconds to load the result of search" - ] -}}' - """ - - print("Equivalent curl command:") - print(curl_command) - - -def main(): - """ - Main function to run the 
markdownify movements example with timing. - """ - try: - print("🎯 MARKDOWNIFY MOVEMENTS EXAMPLE") - print("=" * 60) - print("Note: Markdownify converts websites to clean markdown format") - print("Unlike Smart Scraper, it doesn't support interactive movements") - print("but excels at creating readable markdown content.") - print("This example includes comprehensive timing information.") - print() - - # Show the curl equivalent - show_curl_equivalent() - - print("\n" + "=" * 60) - - # Make the actual API request - markdownify_movements() - - print("\n" + "=" * 60) - print("Example completed!") - print("\nKey takeaways:") - print("1. Markdownify excels at converting websites to clean markdown") - print("2. Custom headers can improve scraping success") - print("3. Content analysis provides valuable insights") - print("4. File saving enables content persistence") - print("\nNext steps:") - print("- Try different websites and content types") - print("- Customize headers for specific websites") - print("- Implement content filtering and processing") - print("- Use the saved markdown files for further analysis") - - except Exception as e: - print(f"💥 Error occurred: {str(e)}") - print("\n🛠️ Troubleshooting:") - print("1. Make sure your .env file contains TEST_API_KEY") - print("2. Ensure the API server is running on localhost:8001") - print("3. Check your internet connection") - print("4. 
Verify the target website is accessible") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/scheduled_jobs/async/async_scheduled_jobs_advanced_example.py b/scrapegraph-py/examples/scheduled_jobs/async/async_scheduled_jobs_advanced_example.py deleted file mode 100644 index 9c46b99..0000000 --- a/scrapegraph-py/examples/scheduled_jobs/async/async_scheduled_jobs_advanced_example.py +++ /dev/null @@ -1,369 +0,0 @@ -import asyncio -import os -from datetime import datetime, timedelta -from typing import Dict, Any, List - -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - - -class ScheduledJobManager: - """Advanced scheduled job manager with monitoring and automation""" - - def __init__(self, client: AsyncClient): - self.client = client - self.active_jobs: Dict[str, Dict[str, Any]] = {} - - async def create_monitoring_job(self, website_url: str, job_name: str, cron_expression: str) -> str: - """Create a job that monitors website changes""" - print(f"📅 Creating monitoring job for {website_url}...") - - job_config = { - "website_url": website_url, - "user_prompt": "Monitor for any changes in content, new articles, or updates. 
Extract the latest information.", - "render_heavy_js": True, - "headers": { - "User-Agent": "Mozilla/5.0 (compatible; MonitoringBot/1.0)" - } - } - - result = await self.client.create_scheduled_job( - job_name=job_name, - service_type="smartscraper", - cron_expression=cron_expression, - job_config=job_config, - is_active=True - ) - - job_id = result["id"] - self.active_jobs[job_id] = { - "name": job_name, - "url": website_url, - "type": "monitoring", - "created_at": datetime.now() - } - - print(f"✅ Created monitoring job with ID: {job_id}") - return job_id - - async def create_data_collection_job(self, search_prompt: str, job_name: str, cron_expression: str) -> str: - """Create a job that collects data from multiple sources""" - print(f"📅 Creating data collection job: {search_prompt}...") - - job_config = { - "user_prompt": search_prompt, - "num_results": 10, - "headers": { - "User-Agent": "Mozilla/5.0 (compatible; DataCollector/1.0)" - } - } - - result = await self.client.create_scheduled_job( - job_name=job_name, - service_type="searchscraper", - cron_expression=cron_expression, - job_config=job_config, - is_active=True - ) - - job_id = result["id"] - self.active_jobs[job_id] = { - "name": job_name, - "prompt": search_prompt, - "type": "data_collection", - "created_at": datetime.now() - } - - print(f"✅ Created data collection job with ID: {job_id}") - return job_id - - async def create_crawl_job(self, base_url: str, job_name: str, cron_expression: str) -> str: - """Create a job that crawls websites for comprehensive data""" - print(f"📅 Creating crawl job for {base_url}...") - - job_config = { - "url": base_url, - "prompt": "Extract all relevant information including titles, descriptions, links, and metadata", - "extraction_mode": True, - "depth": 3, - "max_pages": 50, - "same_domain_only": True, - "cache_website": True, - "sitemap": True - } - - result = await self.client.create_scheduled_job( - job_name=job_name, - service_type="crawl", - 
cron_expression=cron_expression, - job_config=job_config, - is_active=True - ) - - job_id = result["id"] - self.active_jobs[job_id] = { - "name": job_name, - "url": base_url, - "type": "crawl", - "created_at": datetime.now() - } - - print(f"✅ Created crawl job with ID: {job_id}") - return job_id - - async def monitor_job_executions(self, job_id: str, duration_minutes: int = 5): - """Monitor job executions for a specified duration""" - print(f"📊 Monitoring executions for job {job_id} for {duration_minutes} minutes...") - - start_time = datetime.now() - end_time = start_time + timedelta(minutes=duration_minutes) - - while datetime.now() < end_time: - try: - executions = await self.client.get_job_executions(job_id, page=1, page_size=10) - - if executions["executions"]: - latest_execution = executions["executions"][0] - print(f" Latest execution: {latest_execution['status']} at {latest_execution['started_at']}") - - if latest_execution.get('completed_at'): - print(f" Completed at: {latest_execution['completed_at']}") - if latest_execution.get('credits_used'): - print(f" Credits used: {latest_execution['credits_used']}") - - await asyncio.sleep(30) # Check every 30 seconds - - except Exception as e: - print(f" Error monitoring job {job_id}: {e}") - await asyncio.sleep(30) - - async def batch_trigger_jobs(self, job_ids: List[str]): - """Trigger multiple jobs concurrently""" - print(f"🚀 Triggering {len(job_ids)} jobs concurrently...") - - tasks = [self.client.trigger_scheduled_job(job_id) for job_id in job_ids] - results = await asyncio.gather(*tasks, return_exceptions=True) - - for i, result in enumerate(results): - if isinstance(result, Exception): - print(f" ❌ Failed to trigger job {job_ids[i]}: {result}") - else: - print(f" ✅ Triggered job {job_ids[i]}: {result['execution_id']}") - - async def get_job_statistics(self) -> Dict[str, Any]: - """Get comprehensive statistics about all jobs""" - print("📈 Collecting job statistics...") - - all_jobs = await 
self.client.get_scheduled_jobs(page=1, page_size=100) - - stats = { - "total_jobs": all_jobs["total"], - "active_jobs": 0, - "inactive_jobs": 0, - "service_types": {}, - "recent_executions": 0, - "total_credits_used": 0 - } - - for job in all_jobs["jobs"]: - if job["is_active"]: - stats["active_jobs"] += 1 - else: - stats["inactive_jobs"] += 1 - - service_type = job["service_type"] - stats["service_types"][service_type] = stats["service_types"].get(service_type, 0) + 1 - - # Get execution history for each job - try: - executions = await self.client.get_job_executions(job["id"], page=1, page_size=5) - stats["recent_executions"] += len(executions["executions"]) - - for execution in executions["executions"]: - if execution.get("credits_used"): - stats["total_credits_used"] += execution["credits_used"] - - except Exception as e: - print(f" ⚠️ Could not get executions for job {job['id']}: {e}") - - return stats - - async def cleanup_old_jobs(self, days_old: int = 7): - """Clean up jobs older than specified days""" - print(f"🧹 Cleaning up jobs older than {days_old} days...") - - cutoff_date = datetime.now() - timedelta(days=days_old) - jobs_to_delete = [] - - all_jobs = await self.client.get_scheduled_jobs(page=1, page_size=100) - - for job in all_jobs["jobs"]: - created_at = datetime.fromisoformat(job["created_at"].replace('Z', '+00:00')) - if created_at < cutoff_date: - jobs_to_delete.append(job["id"]) - - if jobs_to_delete: - print(f" Found {len(jobs_to_delete)} jobs to delete") - - for job_id in jobs_to_delete: - try: - await self.client.delete_scheduled_job(job_id) - print(f" ✅ Deleted job {job_id}") - except Exception as e: - print(f" ❌ Failed to delete job {job_id}: {e}") - else: - print(" No old jobs found to delete") - - async def export_job_configurations(self) -> List[Dict[str, Any]]: - """Export all job configurations for backup""" - print("💾 Exporting job configurations...") - - all_jobs = await self.client.get_scheduled_jobs(page=1, page_size=100) - 
configurations = [] - - for job in all_jobs["jobs"]: - config = { - "job_name": job["job_name"], - "service_type": job["service_type"], - "cron_expression": job["cron_expression"], - "job_config": job["job_config"], - "is_active": job["is_active"], - "created_at": job["created_at"] - } - configurations.append(config) - - print(f" Exported {len(configurations)} job configurations") - return configurations - - -async def main(): - """Main function demonstrating advanced scheduled jobs management""" - # Initialize async client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. Create a .env file with: SGAI_API_KEY=your-api-key-here") - return - - async with AsyncClient(api_key=api_key) as client: - print("🚀 Starting Advanced Scheduled Jobs Demo") - print("=" * 60) - - manager = ScheduledJobManager(client) - job_ids = [] - - try: - # Create different types of advanced jobs - print("\n📅 Creating Advanced Scheduled Jobs:") - print("-" * 40) - - # News monitoring job - news_job_id = await manager.create_monitoring_job( - website_url="https://techcrunch.com", - job_name="TechCrunch News Monitor", - cron_expression="0 */2 * * *" # Every 2 hours - ) - job_ids.append(news_job_id) - - # AI research job - ai_job_id = await manager.create_data_collection_job( - search_prompt="Latest developments in artificial intelligence and machine learning", - job_name="AI Research Collector", - cron_expression="0 8 * * 1" # Every Monday at 8 AM - ) - job_ids.append(ai_job_id) - - # E-commerce crawl job - ecommerce_job_id = await manager.create_crawl_job( - base_url="https://example-store.com", - job_name="E-commerce Product Crawler", - cron_expression="0 3 * * *" # Daily at 3 AM - ) - job_ids.append(ecommerce_job_id) - - # Get comprehensive statistics - print("\n📈 Job 
Statistics:") - print("-" * 40) - stats = await manager.get_job_statistics() - print(f"Total jobs: {stats['total_jobs']}") - print(f"Active jobs: {stats['active_jobs']}") - print(f"Inactive jobs: {stats['inactive_jobs']}") - print(f"Service types: {stats['service_types']}") - print(f"Recent executions: {stats['recent_executions']}") - print(f"Total credits used: {stats['total_credits_used']}") - - # Trigger jobs concurrently - print("\n🚀 Concurrent Job Triggering:") - print("-" * 40) - await manager.batch_trigger_jobs(job_ids) - - # Monitor executions - print("\n📊 Monitoring Job Executions:") - print("-" * 40) - if job_ids: - await manager.monitor_job_executions(job_ids[0], duration_minutes=2) - - # Export configurations - print("\n💾 Exporting Job Configurations:") - print("-" * 40) - configurations = await manager.export_job_configurations() - print(f"Exported {len(configurations)} configurations") - - # Demonstrate job management - print("\n🔧 Advanced Job Management:") - print("-" * 40) - - # Update job configurations - if job_ids: - print(f"Updating job {job_ids[0]}:") - await client.update_scheduled_job( - job_ids[0], - job_name="Updated TechCrunch Monitor", - cron_expression="0 */1 * * *" # Every hour - ) - print(" ✅ Job updated successfully") - - # Pause and resume - print(f"Pausing job {job_ids[0]}:") - await client.pause_scheduled_job(job_ids[0]) - print(" ✅ Job paused") - - await asyncio.sleep(1) - - print(f"Resuming job {job_ids[0]}:") - await client.resume_scheduled_job(job_ids[0]) - print(" ✅ Job resumed") - - # Cleanup demonstration (commented out to avoid deleting real jobs) - # print("\n🧹 Cleanup Demonstration:") - # print("-" * 40) - # await manager.cleanup_old_jobs(days_old=1) - - except Exception as e: - print(f"❌ Error during execution: {e}") - - finally: - # Clean up created jobs - print("\n🧹 Cleaning up created jobs:") - print("-" * 40) - for job_id in job_ids: - try: - await client.delete_scheduled_job(job_id) - print(f" ✅ Deleted job 
{job_id}") - except Exception as e: - print(f" ❌ Failed to delete job {job_id}: {e}") - - print("\n✅ Advanced Scheduled Jobs Demo completed!") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/scheduled_jobs/async/async_scheduled_jobs_example.py b/scrapegraph-py/examples/scheduled_jobs/async/async_scheduled_jobs_example.py deleted file mode 100644 index b9ec835..0000000 --- a/scrapegraph-py/examples/scheduled_jobs/async/async_scheduled_jobs_example.py +++ /dev/null @@ -1,219 +0,0 @@ -import asyncio -import os -from datetime import datetime -from typing import Dict, Any - -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - - -async def create_smartscraper_job(client: AsyncClient) -> str: - """Create a scheduled job for smartscraper""" - print("📅 Creating SmartScraper scheduled job...") - - job_config = { - "website_url": "https://news.ycombinator.com", - "user_prompt": "Extract the top 5 news titles and their URLs", - "render_heavy_js": False, - "headers": { - "User-Agent": "Mozilla/5.0 (compatible; ScheduledJob/1.0)" - } - } - - result = await client.create_scheduled_job( - job_name="HN Top News Scraper", - service_type="smartscraper", - cron_expression="0 */6 * * *", # Every 6 hours - job_config=job_config, - is_active=True - ) - - job_id = result["id"] - print(f"✅ Created SmartScraper job with ID: {job_id}") - return job_id - - -async def create_searchscraper_job(client: AsyncClient) -> str: - """Create a scheduled job for searchscraper""" - print("📅 Creating SearchScraper scheduled job...") - - job_config = { - "user_prompt": "Find the latest AI and machine learning news", - "num_results": 5, - "headers": { - "User-Agent": "Mozilla/5.0 (compatible; ScheduledJob/1.0)" - } - } - - result = await client.create_scheduled_job( - job_name="AI News Search", 
- service_type="searchscraper", - cron_expression="0 9 * * 1", # Every Monday at 9 AM - job_config=job_config, - is_active=True - ) - - job_id = result["id"] - print(f"✅ Created SearchScraper job with ID: {job_id}") - return job_id - - -async def create_crawl_job(client: AsyncClient) -> str: - """Create a scheduled job for crawl""" - print("📅 Creating Crawl scheduled job...") - - job_config = { - "url": "https://example.com", - "prompt": "Extract all product information", - "extraction_mode": True, - "depth": 2, - "max_pages": 10, - "same_domain_only": True, - "cache_website": True - } - - result = await client.create_scheduled_job( - job_name="Product Catalog Crawler", - service_type="crawl", - cron_expression="0 2 * * *", # Daily at 2 AM - job_config=job_config, - is_active=True - ) - - job_id = result["id"] - print(f"✅ Created Crawl job with ID: {job_id}") - return job_id - - -async def manage_jobs(client: AsyncClient, job_ids: list[str]): - """Demonstrate job management operations""" - print("\n🔧 Managing scheduled jobs...") - - # List all jobs - print("\n📋 Listing all scheduled jobs:") - jobs_result = await client.get_scheduled_jobs(page=1, page_size=10) - print(f"Total jobs: {jobs_result['total']}") - - for job in jobs_result["jobs"]: - print(f" - {job['job_name']} ({job['service_type']}) - Active: {job['is_active']}") - - # Get details of first job - if job_ids: - print(f"\n🔍 Getting details for job {job_ids[0]}:") - job_details = await client.get_scheduled_job(job_ids[0]) - print(f" Name: {job_details['job_name']}") - print(f" Cron: {job_details['cron_expression']}") - print(f" Next run: {job_details.get('next_run_at', 'N/A')}") - - # Pause the first job - print(f"\n⏸️ Pausing job {job_ids[0]}:") - pause_result = await client.pause_scheduled_job(job_ids[0]) - print(f" Status: {pause_result['message']}") - - # Resume the job - print(f"\n▶️ Resuming job {job_ids[0]}:") - resume_result = await client.resume_scheduled_job(job_ids[0]) - print(f" Status: 
{resume_result['message']}") - - # Update job configuration - print(f"\n📝 Updating job {job_ids[0]}:") - update_result = await client.update_scheduled_job( - job_ids[0], - job_name="Updated HN News Scraper", - cron_expression="0 */4 * * *" # Every 4 hours instead of 6 - ) - print(f" Updated job name: {update_result['job_name']}") - print(f" Updated cron: {update_result['cron_expression']}") - - -async def trigger_and_monitor_jobs(client: AsyncClient, job_ids: list[str]): - """Demonstrate manual job triggering and execution monitoring""" - print("\n🚀 Triggering and monitoring jobs...") - - for job_id in job_ids: - print(f"\n🎯 Manually triggering job {job_id}:") - trigger_result = await client.trigger_scheduled_job(job_id) - execution_id = trigger_result["execution_id"] - print(f" Execution ID: {execution_id}") - print(f" Message: {trigger_result['message']}") - - # Wait a bit for execution to start - await asyncio.sleep(2) - - # Get execution history - print(f"\n📊 Getting execution history for job {job_id}:") - executions = await client.get_job_executions(job_id, page=1, page_size=5) - print(f" Total executions: {executions['total']}") - - for execution in executions["executions"][:3]: # Show last 3 executions - print(f" - Execution {execution['id']}: {execution['status']}") - print(f" Started: {execution['started_at']}") - if execution.get('completed_at'): - print(f" Completed: {execution['completed_at']}") - if execution.get('credits_used'): - print(f" Credits used: {execution['credits_used']}") - - -async def cleanup_jobs(client: AsyncClient, job_ids: list[str]): - """Clean up created jobs""" - print("\n🧹 Cleaning up created jobs...") - - for job_id in job_ids: - print(f"🗑️ Deleting job {job_id}:") - delete_result = await client.delete_scheduled_job(job_id) - print(f" Status: {delete_result['message']}") - - -async def main(): - """Main function demonstrating async scheduled jobs""" - # Initialize async client with API key from environment variable - api_key = 
os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. Create a .env file with: SGAI_API_KEY=your-api-key-here") - return - - async with AsyncClient(api_key=api_key) as client: - print("🚀 Starting Async Scheduled Jobs Demo") - print("=" * 50) - - job_ids = [] - - try: - # Create different types of scheduled jobs - smartscraper_job_id = await create_smartscraper_job(client) - job_ids.append(smartscraper_job_id) - - searchscraper_job_id = await create_searchscraper_job(client) - job_ids.append(searchscraper_job_id) - - crawl_job_id = await create_crawl_job(client) - job_ids.append(crawl_job_id) - - # Manage jobs - await manage_jobs(client, job_ids) - - # Trigger and monitor jobs - await trigger_and_monitor_jobs(client, job_ids) - - except Exception as e: - print(f"❌ Error during execution: {e}") - - finally: - # Clean up - await cleanup_jobs(client, job_ids) - - print("\n✅ Async Scheduled Jobs Demo completed!") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/scheduled_jobs/sync/scheduled_jobs_example.py b/scrapegraph-py/examples/scheduled_jobs/sync/scheduled_jobs_example.py deleted file mode 100644 index e0e0883..0000000 --- a/scrapegraph-py/examples/scheduled_jobs/sync/scheduled_jobs_example.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python3 -"""Scheduled Jobs Example - Sync Client""" - -import os -from scrapegraph_py import Client -from scrapegraph_py.models.scheduled_jobs import ServiceType - -def main(): - client = Client.from_env() - - print("🚀 ScrapeGraph AI Scheduled Jobs Example") - print("=" * 50) - - try: - print("\n📅 Creating a scheduled SmartScraper job...") - - smartscraper_config = { - "website_url": "https://example.com", - "user_prompt": "Extract the main heading and description from the page" - } - - job = 
client.create_scheduled_job( - job_name="Daily Example Scraping", - service_type=ServiceType.SMARTSCRAPER, - cron_expression="0 9 * * *", - job_config=smartscraper_config, - is_active=True - ) - - job_id = job["id"] - print(f"✅ Created job: {job['job_name']} (ID: {job_id})") - print(f" Next run: {job.get('next_run_at', 'Not scheduled')}") - - print("\n📅 Creating a scheduled SearchScraper job...") - - searchscraper_config = { - "user_prompt": "Find the latest news about artificial intelligence", - "num_results": 5 - } - - search_job = client.create_scheduled_job( - job_name="Weekly AI News Search", - service_type=ServiceType.SEARCHSCRAPER, - cron_expression="0 10 * * 1", - job_config=searchscraper_config, - is_active=True - ) - - search_job_id = search_job["id"] - print(f"✅ Created job: {search_job['job_name']} (ID: {search_job_id})") - - print("\n📋 Listing all scheduled jobs...") - - jobs_response = client.get_scheduled_jobs(page=1, page_size=10) - jobs = jobs_response["jobs"] - - print(f"Found {jobs_response['total']} total jobs:") - for job in jobs: - status = "🟢 Active" if job["is_active"] else "🔴 Inactive" - print(f" - {job['job_name']} ({job['service_type']}) - {status}") - print(f" Schedule: {job['cron_expression']}") - if job.get('next_run_at'): - print(f" Next run: {job['next_run_at']}") - - print(f"\n🔍 Getting details for job {job_id}...") - - job_details = client.get_scheduled_job(job_id) - print(f"Job Name: {job_details['job_name']}") - print(f"Service Type: {job_details['service_type']}") - print(f"Created: {job_details['created_at']}") - print(f"Active: {job_details['is_active']}") - - print(f"\n📝 Updating job schedule...") - - updated_job = client.update_scheduled_job( - job_id=job_id, - cron_expression="0 8 * * *", - job_name="Daily Example Scraping (Updated)" - ) - - print(f"✅ Updated job: {updated_job['job_name']}") - print(f" New schedule: {updated_job['cron_expression']}") - - print(f"\n⏸️ Pausing job {job_id}...") - - pause_result = 
client.pause_scheduled_job(job_id) - print(f"✅ {pause_result['message']}") - print(f" Job is now: {'Active' if pause_result['is_active'] else 'Paused'}") - - print(f"\n▶️ Resuming job {job_id}...") - - resume_result = client.resume_scheduled_job(job_id) - print(f"✅ {resume_result['message']}") - print(f" Job is now: {'Active' if resume_result['is_active'] else 'Paused'}") - if resume_result.get('next_run_at'): - print(f" Next run: {resume_result['next_run_at']}") - - print(f"\n🚀 Manually triggering job {job_id}...") - - trigger_result = client.trigger_scheduled_job(job_id) - print(f"✅ {trigger_result['message']}") - print(f" Execution ID: {trigger_result['execution_id']}") - print(f" Triggered at: {trigger_result['triggered_at']}") - - print(f"\n📊 Getting execution history for job {job_id}...") - - executions_response = client.get_job_executions( - job_id=job_id, - page=1, - page_size=5 - ) - - executions = executions_response["executions"] - print(f"Found {executions_response['total']} total executions:") - - for execution in executions: - status_emoji = { - "completed": "✅", - "failed": "❌", - "running": "🔄", - "pending": "⏳" - }.get(execution["status"], "❓") - - print(f" {status_emoji} {execution['status'].upper()}") - print(f" Started: {execution['started_at']}") - if execution.get('completed_at'): - print(f" Completed: {execution['completed_at']}") - if execution.get('credits_used'): - print(f" Credits used: {execution['credits_used']}") - - print(f"\n🔧 Filtering jobs by service type (smartscraper)...") - - filtered_jobs = client.get_scheduled_jobs( - service_type=ServiceType.SMARTSCRAPER, - is_active=True - ) - - print(f"Found {filtered_jobs['total']} active SmartScraper jobs:") - for job in filtered_jobs["jobs"]: - print(f" - {job['job_name']} (Schedule: {job['cron_expression']})") - - print(f"\n🗑️ Cleaning up - deleting created jobs...") - - delete_result1 = client.delete_scheduled_job(job_id) - print(f"✅ {delete_result1['message']} (Job 1)") - - 
delete_result2 = client.delete_scheduled_job(search_job_id) - print(f"✅ {delete_result2['message']} (Job 2)") - - print("\n🎉 Scheduled jobs example completed successfully!") - - except Exception as e: - print(f"\n❌ Error: {str(e)}") - raise - - finally: - client.close() - - -if __name__ == "__main__": - if os.getenv("SGAI_MOCK", "0").lower() in ["1", "true", "yes"]: - print("🧪 Running in MOCK mode - no real API calls will be made") - - main() \ No newline at end of file diff --git a/scrapegraph-py/examples/scrape/async/async_scrape_example.py b/scrapegraph-py/examples/scrape/async/async_scrape_example.py deleted file mode 100644 index 11ab1f5..0000000 --- a/scrapegraph-py/examples/scrape/async/async_scrape_example.py +++ /dev/null @@ -1,331 +0,0 @@ -""" -Basic asynchronous example demonstrating how to use the Scrape API. - -This example shows: -1. How to make async scrape requests -2. How to process multiple URLs concurrently -3. How to use render_heavy_js for JavaScript-heavy websites -4. How to use branding parameter -5. 
How to add custom headers in async mode - -Equivalent curl command: -curl -X POST https://api.scrapegraphai.com/v1/scrape \ - -H "Content-Type: application/json" \ - -H "SGAI-APIKEY: your-api-key-here" \ - -d '{ - "website_url": "https://www.cubic.dev/", - "render_heavy_js": false, - "branding": true - }' - -Requirements: -- Python 3.7+ -- scrapegraph-py -- python-dotenv -- aiohttp -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import asyncio -import time -from pathlib import Path -from typing import List, Dict, Any -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient - -# Load environment variables from .env file -load_dotenv() - - -async def basic_async_scrape(): - """Demonstrate basic async scrape functionality.""" - print("🌐 Basic Async Scrape Example") - print("=" * 35) - - async with AsyncClient.from_env() as client: - try: - print("Making basic async scrape request...") - result = await client.scrape( - website_url="https://example.com", - render_heavy_js=False - ) - - html_content = result.get("html", "") - print(f"✅ Success! Received {len(html_content):,} characters of HTML") - print(f"Request ID: {result.get('request_id', 'N/A')}") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - - -async def async_scrape_with_heavy_js(): - """Demonstrate async scraping with heavy JavaScript rendering.""" - print("\n🚀 Async Heavy JavaScript Rendering Example") - print("=" * 50) - - async with AsyncClient.from_env() as client: - try: - print("Making async scrape request with heavy JS rendering...") - start_time = time.time() - - result = await client.scrape( - website_url="https://example.com", - render_heavy_js=True - ) - - execution_time = time.time() - start_time - html_content = result.get("html", "") - - print(f"✅ Success! 
Received {len(html_content):,} characters of HTML") - print(f"⏱️ Execution time: {execution_time:.2f} seconds") - print(f"Request ID: {result.get('request_id', 'N/A')}") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - - -async def scrape_single_url(client: AsyncClient, url: str, use_js: bool = False) -> Dict[str, Any]: - """Scrape a single URL with error handling.""" - try: - result = await client.scrape( - website_url=url, - render_heavy_js=use_js - ) - - html_content = result.get("html", "") - return { - "url": url, - "success": True, - "html_length": len(html_content), - "request_id": result.get("request_id"), - "result": result - } - - except Exception as e: - return { - "url": url, - "success": False, - "error": str(e), - "html_length": 0 - } - - -async def concurrent_scraping_example(): - """Demonstrate scraping multiple URLs concurrently.""" - print("\n⚡ Concurrent Scraping Example") - print("=" * 35) - - # URLs to scrape concurrently - urls = [ - "https://example.com", - "https://httpbin.org/html", - "https://httpbin.org/json" - ] - - async with AsyncClient.from_env() as client: - print(f"Scraping {len(urls)} URLs concurrently...") - start_time = time.time() - - # Create tasks for concurrent execution - tasks = [scrape_single_url(client, url) for url in urls] - results = await asyncio.gather(*tasks, return_exceptions=True) - - total_time = time.time() - start_time - - # Process results - successful = 0 - total_html_length = 0 - - for result in results: - if isinstance(result, Exception): - print(f"❌ Exception: {result}") - continue - - if result["success"]: - successful += 1 - total_html_length += result["html_length"] - print(f"✅ {result['url']}: {result['html_length']:,} chars") - else: - print(f"❌ {result['url']}: {result['error']}") - - print(f"\n📊 Results:") - print(f" Total time: {total_time:.2f} seconds") - print(f" Successful: {successful}/{len(urls)}") - print(f" Total HTML: {total_html_length:,} 
characters") - print(f" Average per URL: {total_time/len(urls):.2f} seconds") - - return results - - -async def async_scrape_with_branding(): - """Demonstrate async scraping with branding enabled.""" - print("\n🏷️ Async Branding Example") - print("=" * 30) - - async with AsyncClient.from_env() as client: - try: - print("Making async scrape request with branding enabled...") - result = await client.scrape( - website_url="https://www.cubic.dev/", - render_heavy_js=False, - branding=True - ) - - html_content = result.get("html", "") - print(f"✅ Success! Received {len(html_content):,} characters of HTML") - print(f"Request ID: {result.get('request_id', 'N/A')}") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - - -async def async_scrape_with_custom_headers(): - """Demonstrate async scraping with custom headers.""" - print("\n🔧 Async Custom Headers Example") - print("=" * 35) - - # Custom headers - custom_headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", - "Connection": "keep-alive" - } - - async with AsyncClient.from_env() as client: - try: - print("Making async scrape request with custom headers...") - result = await client.scrape( - website_url="https://httpbin.org/headers", - render_heavy_js=False, - headers=custom_headers - ) - - html_content = result.get("html", "") - print(f"✅ Success! 
Received {len(html_content):,} characters of HTML") - print(f"Request ID: {result.get('request_id', 'N/A')}") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - - -async def save_html_to_file_async(html_content: str, filename: str): - """Save HTML content to a file asynchronously.""" - output_dir = Path("async_scrape_output") - output_dir.mkdir(exist_ok=True) - - file_path = output_dir / f"{filename}.html" - - # Use asyncio.to_thread for file I/O - await asyncio.to_thread( - lambda: file_path.write_text(html_content, encoding="utf-8") - ) - - print(f"💾 HTML saved to: {file_path}") - return file_path - - -def demonstrate_curl_equivalent(): - """Show the equivalent curl commands.""" - print("🌐 Equivalent curl commands:") - print("=" * 35) - - print("1. Basic async scrape (same as sync):") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://example.com\",") - print(" \"render_heavy_js\": false") - print(" }'") - - print("\n2. With branding enabled:") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://www.cubic.dev/\",") - print(" \"render_heavy_js\": false,") - print(" \"branding\": true") - print(" }'") - - print("\n3. 
Multiple concurrent requests:") - print("# Run multiple curl commands in parallel:") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{\"website_url\": \"https://example.com\"}' &") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{\"website_url\": \"https://httpbin.org/html\"}' &") - print("wait # Wait for all background jobs to complete") - - -async def main(): - """Main async function demonstrating scrape functionality.""" - print("🚀 Async Scrape API Examples") - print("=" * 30) - - # Show curl equivalents first - demonstrate_curl_equivalent() - - try: - # Run async examples - result1 = await basic_async_scrape() - result2 = await async_scrape_with_heavy_js() - result3 = await async_scrape_with_branding() - result4 = await async_scrape_with_custom_headers() - concurrent_results = await concurrent_scraping_example() - - # Save results if successful - if result1: - html1 = result1.get("html", "") - if html1: - await save_html_to_file_async(html1, "basic_async_scrape") - - if result3: - html3 = result3.get("html", "") - if html3: - await save_html_to_file_async(html3, "branding_async_scrape") - - if result4: - html4 = result4.get("html", "") - if html4: - await save_html_to_file_async(html4, "custom_headers_async_scrape") - - print("\n🎯 Summary:") - print(f"✅ Basic async scrape: {'Success' if result1 else 'Failed'}") - print(f"✅ Heavy JS async scrape: {'Success' if result2 else 'Failed'}") - print(f"✅ Branding async scrape: {'Success' if result3 else 'Failed'}") - print(f"✅ Custom headers async scrape: {'Success' if result4 else 'Failed'}") - print(f"✅ Concurrent scraping: {'Success' if concurrent_results else 'Failed'}") - - except Exception as e: - print(f"❌ Unexpected error: {str(e)}") - - 
print("\n📚 Next steps:") - print("• Try running multiple curl commands in parallel") - print("• Experiment with different concurrency levels") - print("• Test with your own list of URLs") - print("• Compare async vs sync performance for multiple URLs") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/scrape/sync/scrape_example.py b/scrapegraph-py/examples/scrape/sync/scrape_example.py deleted file mode 100644 index 4b3e4c4..0000000 --- a/scrapegraph-py/examples/scrape/sync/scrape_example.py +++ /dev/null @@ -1,272 +0,0 @@ -""" -Basic synchronous example demonstrating how to use the Scrape API. - -This example shows: -1. How to make a basic scrape request -2. How to use render_heavy_js for JavaScript-heavy websites -3. How to use branding parameter -4. How to add custom headers -5. How to handle the response - -Equivalent curl command: -curl -X POST https://api.scrapegraphai.com/v1/scrape \ - -H "Content-Type: application/json" \ - -H "SGAI-APIKEY: your-api-key-here" \ - -d '{ - "website_url": "https://example.com", - "render_heavy_js": false, - "branding": true - }' - -Requirements: -- Python 3.7+ -- scrapegraph-py -- python-dotenv -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import time -from pathlib import Path -from dotenv import load_dotenv - -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -def basic_scrape_example(): - """Demonstrate basic scrape functionality.""" - print("🌐 Basic Scrape Example") - print("=" * 30) - - # Initialize client - client = Client.from_env() - - try: - # Basic scrape request - print("Making basic scrape request...") - result = client.scrape( - website_url="https://example.com", - render_heavy_js=False - ) - - # Display results - html_content = result.get("html", "") - print(f"✅ Success! 
Received {len(html_content):,} characters of HTML") - print(f"Request ID: {result.get('request_id', 'N/A')}") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - finally: - client.close() - - -def scrape_with_heavy_js(): - """Demonstrate scraping with heavy JavaScript rendering.""" - print("\n🚀 Heavy JavaScript Rendering Example") - print("=" * 45) - - client = Client.from_env() - - try: - print("Making scrape request with heavy JS rendering...") - start_time = time.time() - - result = client.scrape( - website_url="https://example.com", - render_heavy_js=True # Enable JavaScript rendering - ) - - execution_time = time.time() - start_time - html_content = result.get("html", "") - - print(f"✅ Success! Received {len(html_content):,} characters of HTML") - print(f"⏱️ Execution time: {execution_time:.2f} seconds") - print(f"Request ID: {result.get('request_id', 'N/A')}") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - finally: - client.close() - - -def scrape_with_branding(): - """Demonstrate scraping with branding enabled.""" - print("\n🏷️ Branding Example") - print("=" * 30) - - client = Client.from_env() - - try: - print("Making scrape request with branding enabled...") - result = client.scrape( - website_url="https://www.cubic.dev/", - render_heavy_js=False, - branding=True - ) - - html_content = result.get("html", "") - print(f"✅ Success! 
Received {len(html_content):,} characters of HTML") - print(f"Request ID: {result.get('request_id', 'N/A')}") - - # Show a preview of the HTML - preview = html_content[:200].replace('\n', ' ').strip() - print(f"HTML Preview: {preview}...") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - finally: - client.close() - - -def scrape_with_custom_headers(): - """Demonstrate scraping with custom headers.""" - print("\n🔧 Custom Headers Example") - print("=" * 30) - - client = Client.from_env() - - # Custom headers for better compatibility - custom_headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1" - } - - try: - print("Making scrape request with custom headers...") - result = client.scrape( - website_url="https://httpbin.org/html", - render_heavy_js=False, - headers=custom_headers - ) - - html_content = result.get("html", "") - print(f"✅ Success! 
Received {len(html_content):,} characters of HTML") - print(f"Request ID: {result.get('request_id', 'N/A')}") - - # Show a preview of the HTML - preview = html_content[:200].replace('\n', ' ').strip() - print(f"HTML Preview: {preview}...") - - return result - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - finally: - client.close() - - -def save_html_to_file(html_content: str, filename: str): - """Save HTML content to a file.""" - output_dir = Path("scrape_output") - output_dir.mkdir(exist_ok=True) - - file_path = output_dir / f"{filename}.html" - with open(file_path, "w", encoding="utf-8") as f: - f.write(html_content) - - print(f"💾 HTML saved to: {file_path}") - return file_path - - -def demonstrate_curl_equivalent(): - """Show the equivalent curl commands.""" - print("\n🌐 Equivalent curl commands:") - print("=" * 35) - - print("1. Basic scrape:") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://example.com\",") - print(" \"render_heavy_js\": false") - print(" }'") - - print("\n2. With heavy JS rendering:") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://example.com\",") - print(" \"render_heavy_js\": true") - print(" }'") - - print("\n3. 
With branding enabled:") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://www.cubic.dev/\",") - print(" \"render_heavy_js\": false,") - print(" \"branding\": true") - print(" }'") - - -def main(): - """Main function demonstrating scrape functionality.""" - print("🚀 Scrape API Examples") - print("=" * 25) - - # Show curl equivalents first - demonstrate_curl_equivalent() - - try: - # Run examples - result1 = basic_scrape_example() - result2 = scrape_with_heavy_js() - result3 = scrape_with_branding() - result4 = scrape_with_custom_headers() - - # Save results if successful - if result1: - html1 = result1.get("html", "") - if html1: - save_html_to_file(html1, "basic_scrape") - - if result3: - html3 = result3.get("html", "") - if html3: - save_html_to_file(html3, "branding_scrape") - - if result4: - html4 = result4.get("html", "") - if html4: - save_html_to_file(html4, "custom_headers_scrape") - - print("\n🎯 Summary:") - print(f"✅ Basic scrape: {'Success' if result1 else 'Failed'}") - print(f"✅ Heavy JS scrape: {'Success' if result2 else 'Failed'}") - print(f"✅ Branding scrape: {'Success' if result3 else 'Failed'}") - print(f"✅ Custom headers scrape: {'Success' if result4 else 'Failed'}") - - except Exception as e: - print(f"❌ Unexpected error: {str(e)}") - - print("\n📚 Next steps:") - print("• Try the curl commands in your terminal") - print("• Experiment with different websites") - print("• Test with your own custom headers") - print("• Compare render_heavy_js=true vs false for dynamic sites") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/searchscraper/async/async_searchscraper_example.py b/scrapegraph-py/examples/searchscraper/async/async_searchscraper_example.py deleted file mode 100644 index c332631..0000000 --- 
a/scrapegraph-py/examples/searchscraper/async/async_searchscraper_example.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Example of using the async searchscraper functionality to search for information concurrently. - -This example demonstrates the configurable website limits feature: -- Default: 3 websites (30 credits) -- Enhanced: 5 websites (50 credits) - for better research depth -- Maximum: 20 websites (200 credits) - for comprehensive research -""" - -import asyncio - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_logging(level="INFO") - - -async def main(): - # Initialize async client - sgai_client = AsyncClient(api_key="your-api-key-here") - - # List of search queries with different website limits for demonstration - queries = [ - ("What is the latest version of Python and what are its main features?", 3), - ("What are the key differences between Python 2 and Python 3?", 5), - ("What is Python's GIL and how does it work?", 3), - ] - - # Create tasks for concurrent execution with configurable website limits - tasks = [ - sgai_client.searchscraper(user_prompt=query, num_results=num_results) - for query, num_results in queries - ] - - # Execute requests concurrently - responses = await asyncio.gather(*tasks, return_exceptions=True) - - # Process results - for i, response in enumerate(responses): - if isinstance(response, Exception): - print(f"\nError for query {i+1}: {response}") - else: - query, num_results = queries[i] - print(f"\nSearch {i+1}:") - print(f"Query: {query}") - print( - f"Websites searched: {num_results} (Credits: {30 if num_results <= 3 else 30 + (num_results - 3) * 10})" - ) - print(f"Result: {response['result']}") - print("Reference URLs:") - for url in response["reference_urls"]: - print(f"- {url}") - - await sgai_client.close() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/searchscraper/async/async_searchscraper_markdown_example.py 
b/scrapegraph-py/examples/searchscraper/async/async_searchscraper_markdown_example.py deleted file mode 100644 index aec4437..0000000 --- a/scrapegraph-py/examples/searchscraper/async/async_searchscraper_markdown_example.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env python3 -""" -Async SearchScraper Markdown Example - -This example demonstrates using the async SearchScraper API in markdown mode -to search and scrape web pages, returning raw markdown content instead of -AI-extracted data. - -Features demonstrated: -- Async search and scrape with markdown output -- Polling for async results -- Error handling with async operations -- Cost-effective: Only 2 credits per page (vs 10 credits for AI extraction) - -Requirements: -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import asyncio -import os -from typing import Optional - -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - - -async def wait_for_completion( - client: AsyncClient, request_id: str, max_wait_time: int = 60 -) -> Optional[dict]: - """ - Poll for completion of an async SearchScraper request. - - Args: - client: The AsyncClient instance - request_id: The request ID to poll for - max_wait_time: Maximum time to wait in seconds - - Returns: - The completed response or None if timeout - """ - import time - - start_time = time.time() - - while time.time() - start_time < max_wait_time: - try: - result = await client.get_searchscraper(request_id) - - if result.get("status") == "completed": - return result - elif result.get("status") == "failed": - print(f"❌ Request failed: {result.get('error', 'Unknown error')}") - return None - else: - print(f"⏳ Status: {result.get('status', 'processing')}... 
waiting 5 seconds") - await asyncio.sleep(5) - - except Exception as e: - print(f"⚠️ Error polling for results: {str(e)}") - await asyncio.sleep(5) - - print("⏰ Timeout waiting for completion") - return None - - -async def basic_searchscraper_markdown_example() -> bool: - """ - Run a basic SearchScraper example with markdown output. - - Returns: - bool: True if successful, False otherwise - """ - print("🔍 Async SearchScraper Markdown Example") - print("=" * 50) - - # Initialize the async client with API key from environment - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ SGAI_API_KEY not found in environment variables.") - print("Please create a .env file with: SGAI_API_KEY=your_api_key_here") - return False - - async with AsyncClient(api_key=api_key) as client: - try: - # Configuration - user_prompt = "Latest developments in artificial intelligence" - num_results = 3 - - print(f"📝 Query: {user_prompt}") - print(f"📊 Results: {num_results} websites") - print("🔧 Mode: Markdown conversion") - print("💰 Cost: 2 credits per page (vs 10 for AI extraction)") - - # Send a searchscraper request in markdown mode - response = await client.searchscraper( - user_prompt=user_prompt, - num_results=num_results, - extraction_mode=False, # False = markdown mode, True = AI extraction mode - ) - - print(f"\n✅ SearchScraper request submitted successfully!") - print(f"📄 Request ID: {response.get('request_id', 'N/A')}") - - # Check if this is an async request that needs polling - if 'request_id' in response and 'status' not in response: - print("⏳ Waiting for async processing to complete...") - - # Poll for completion - final_result = await wait_for_completion(client, response['request_id']) - - if final_result: - response = final_result - else: - print("❌ Failed to get completed results") - return False - - # Display results - if response.get("status") == "completed": - print("\n🎉 SearchScraper markdown completed successfully!") - - # Display markdown content (first 
500 chars) - markdown_content = response.get("markdown_content", "") - if markdown_content: - print("\n📝 Markdown Content Preview:") - print(f"{markdown_content[:500]}{'...' if len(markdown_content) > 500 else ''}") - else: - print("⚠️ No markdown content returned") - - # Display reference URLs - reference_urls = response.get("reference_urls", []) - if reference_urls: - print(f"\n🔗 References: {len(reference_urls)}") - print("\n🔗 Reference URLs:") - for i, url in enumerate(reference_urls, 1): - print(f" {i}. {url}") - else: - print("⚠️ No reference URLs returned") - - return True - else: - print(f"❌ Request not completed. Status: {response.get('status', 'unknown')}") - return False - - except Exception as e: - print(f"❌ Error: {str(e)}") - return False - - -async def main(): - """Main function to run the example.""" - success = await basic_searchscraper_markdown_example() - return success - - -if __name__ == "__main__": - success = asyncio.run(main()) - exit(0 if success else 1) - diff --git a/scrapegraph-py/examples/searchscraper/async/async_searchscraper_schema_example.py b/scrapegraph-py/examples/searchscraper/async/async_searchscraper_schema_example.py deleted file mode 100644 index 385078e..0000000 --- a/scrapegraph-py/examples/searchscraper/async/async_searchscraper_schema_example.py +++ /dev/null @@ -1,138 +0,0 @@ -""" -Example of using the async searchscraper functionality with output schemas for extraction. 
- -This example demonstrates both schema-based output and configurable website limits: -- Using different website limits for different complexity levels -- Enhanced searches provide better data for complex schema population -- Concurrent processing of multiple schema-based searches -""" - -import asyncio -from typing import List - -from pydantic import BaseModel - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_logging(level="INFO") - - -# Define schemas for extracting structured data -class PythonVersionInfo(BaseModel): - version: str - release_date: str - major_features: List[str] - - -class PythonComparison(BaseModel): - key_differences: List[str] - backward_compatible: bool - migration_difficulty: str - - -class GILInfo(BaseModel): - definition: str - purpose: str - limitations: List[str] - workarounds: List[str] - - -async def main(): - # Initialize async client - sgai_client = AsyncClient(api_key="your-api-key-here") - - # Define search queries with their corresponding schemas and website limits - searches = [ - { - "prompt": "What is the latest version of Python? Include the release date and main features.", - "schema": PythonVersionInfo, - "num_results": 4, # Moderate search for version info (40 credits) - }, - { - "prompt": "Compare Python 2 and Python 3, including backward compatibility and migration difficulty.", - "schema": PythonComparison, - "num_results": 6, # Enhanced search for comparison (60 credits) - }, - { - "prompt": "Explain Python's GIL, its purpose, limitations, and possible workarounds.", - "schema": GILInfo, - "num_results": 8, # Deep search for technical details (80 credits) - }, - ] - - print("🚀 Starting concurrent schema-based searches with configurable limits:") - for i, search in enumerate(searches, 1): - credits = ( - 30 if search["num_results"] <= 3 else 30 + (search["num_results"] - 3) * 10 - ) - print( - f" {i}. 
{search['num_results']} websites ({credits} credits): {search['prompt'][:50]}..." - ) - print() - - # Create tasks for concurrent execution with configurable website limits - tasks = [ - sgai_client.searchscraper( - user_prompt=search["prompt"], - num_results=search["num_results"], - output_schema=search["schema"], - ) - for search in searches - ] - - # Execute requests concurrently - responses = await asyncio.gather(*tasks, return_exceptions=True) - - # Process results - for i, response in enumerate(responses): - if isinstance(response, Exception): - print(f"\nError for search {i+1}: {response}") - else: - print(f"\nSearch {i+1}:") - print(f"Query: {searches[i]['prompt']}") - # print(f"Raw Result: {response['result']}") - - try: - # Try to extract structured data using the schema - result = searches[i]["schema"].model_validate(response["result"]) - - # Print extracted structured data - if isinstance(result, PythonVersionInfo): - print("\nExtracted Data:") - print(f"Python Version: {result.version}") - print(f"Release Date: {result.release_date}") - print("Major Features:") - for feature in result.major_features: - print(f"- {feature}") - - elif isinstance(result, PythonComparison): - print("\nExtracted Data:") - print("Key Differences:") - for diff in result.key_differences: - print(f"- {diff}") - print(f"Backward Compatible: {result.backward_compatible}") - print(f"Migration Difficulty: {result.migration_difficulty}") - - elif isinstance(result, GILInfo): - print("\nExtracted Data:") - print(f"Definition: {result.definition}") - print(f"Purpose: {result.purpose}") - print("Limitations:") - for limit in result.limitations: - print(f"- {limit}") - print("Workarounds:") - for workaround in result.workarounds: - print(f"- {workaround}") - except Exception as e: - print(f"\nCould not extract structured data: {e}") - - print("\nReference URLs:") - for url in response["reference_urls"]: - print(f"- {url}") - - await sgai_client.close() - - -if __name__ == "__main__": - 
asyncio.run(main()) diff --git a/scrapegraph-py/examples/searchscraper/sync/searchscraper_example.py b/scrapegraph-py/examples/searchscraper/sync/searchscraper_example.py deleted file mode 100644 index 6fd735b..0000000 --- a/scrapegraph-py/examples/searchscraper/sync/searchscraper_example.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Example of using the searchscraper functionality to search for information. - -This example demonstrates the configurable website limits feature: -- Default: 3 websites (30 credits) -- Enhanced: 5 websites (50 credits) - uncomment to try -- Maximum: 20 websites (200 credits) - for comprehensive research - -Requirements: -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import os - -from dotenv import load_dotenv - -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - -# Initialize the client with API key from environment -api_key = os.getenv("SGAI_API_KEY") -if not api_key: - raise ValueError( - "SGAI_API_KEY not found in environment variables. 
Please create a .env file with: SGAI_API_KEY=your_api_key_here" - ) - -client = Client(api_key=api_key) - -# Send a searchscraper request with configurable website limits -response = client.searchscraper( - user_prompt="What is the latest version of Python and what are its main features?", - num_results=3, # Default: 3 websites (30 credits) - # num_results=5 # Enhanced: 5 websites (50 credits) - uncomment for more comprehensive results - # num_results=10 # Deep research: 10 websites (100 credits) - uncomment for extensive research -) - -# Print the results -print("\nResults:") -print(f"Answer: {response['result']}") -print("\nReference URLs:") -for url in response["reference_urls"]: - print(f"- {url}") - -# Close the client -client.close() diff --git a/scrapegraph-py/examples/searchscraper/sync/searchscraper_markdown_example.py b/scrapegraph-py/examples/searchscraper/sync/searchscraper_markdown_example.py deleted file mode 100644 index 3e4cb55..0000000 --- a/scrapegraph-py/examples/searchscraper/sync/searchscraper_markdown_example.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -""" -Basic SearchScraper Markdown Example - -This example demonstrates the simplest way to use the SearchScraper API -in markdown mode to search and scrape web pages, returning raw markdown content -instead of AI-extracted data. 
- -Features demonstrated: -- Basic search and scrape with markdown output -- Simple error handling -- Minimal code approach -- Cost-effective: Only 2 credits per page (vs 10 credits for AI extraction) - -Requirements: -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import os - -from dotenv import load_dotenv - -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - - -def main(): - """Run a basic SearchScraper example with markdown output.""" - print("🔍 Basic SearchScraper Markdown Example") - print("=" * 50) - - # Initialize the client with API key from environment - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ SGAI_API_KEY not found in environment variables.") - print("Please create a .env file with: SGAI_API_KEY=your_api_key_here") - return False - - client = Client(api_key=api_key) - - try: - # Configuration - user_prompt = "Latest developments in artificial intelligence" - num_results = 3 - - print(f"📝 Query: {user_prompt}") - print(f"📊 Results: {num_results} websites") - print("🔧 Mode: Markdown conversion") - print("💰 Cost: 2 credits per page (vs 10 for AI extraction)") - - # Send a searchscraper request in markdown mode - response = client.searchscraper( - user_prompt=user_prompt, - num_results=num_results, - extraction_mode=False, # False = markdown mode, True = AI extraction mode - ) - - print("\n✅ SearchScraper markdown completed successfully!") - print(f"📄 Request ID: {response.get('request_id', 'N/A')}") - - # For async requests, you would need to poll for results - if 'request_id' in response: - print("📝 This is an async request. 
Use get_searchscraper() to retrieve results.") - print(f"🔍 Use: client.get_searchscraper('{response['request_id']}')") - else: - # If it's a sync response, display the results - if 'markdown_content' in response: - markdown_content = response.get("markdown_content", "") - print(f"\n📝 Markdown Content Preview:") - print(f"{markdown_content[:500]}{'...' if len(markdown_content) > 500 else ''}") - - if 'reference_urls' in response: - print(f"\n🔗 References: {len(response.get('reference_urls', []))}") - print("\n🔗 Reference URLs:") - for i, url in enumerate(response.get("reference_urls", []), 1): - print(f" {i}. {url}") - - return True - - except Exception as e: - print(f"❌ Error: {str(e)}") - return False - - finally: - # Close the client - client.close() - - -if __name__ == "__main__": - success = main() - exit(0 if success else 1) - diff --git a/scrapegraph-py/examples/searchscraper/sync/searchscraper_schema_example.py b/scrapegraph-py/examples/searchscraper/sync/searchscraper_schema_example.py deleted file mode 100644 index fbc5422..0000000 --- a/scrapegraph-py/examples/searchscraper/sync/searchscraper_schema_example.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Example of using the searchscraper functionality with a custom output schema. 
- -This example demonstrates both schema-based output and configurable website limits: -- Default: 3 websites (30 credits) -- Enhanced: 5 websites (50 credits) - provides more comprehensive data for schema -- Maximum: 20 websites (200 credits) - for highly detailed schema population -""" - -from typing import List - -from pydantic import BaseModel - -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_logging(level="INFO") - - -# Define a custom schema for the output -class PythonVersionInfo(BaseModel): - version: str - release_date: str - major_features: List[str] - is_latest: bool - - -# Initialize the client -client = Client(api_key="your-api-key-here") - -# Send a searchscraper request with schema and configurable website limits -num_results = 5 # Enhanced search for better schema data (50 credits) -print(f"🔍 Searching {num_results} websites with custom schema") -print(f"💳 Credits required: {30 if num_results <= 3 else 30 + (num_results - 3) * 10}") - -response = client.searchscraper( - user_prompt="What is the latest version of Python? Include the release date and main features.", - num_results=num_results, # More websites for better schema population - output_schema=PythonVersionInfo, -) - -# The result will be structured according to our schema -print(f"Request ID: {response['request_id']}") -print(f"Result: {response['result']}") - -print("\nReference URLs:") -for url in response["reference_urls"]: - print(f"- {url}") - -# Close the client -client.close() diff --git a/scrapegraph-py/examples/sitemap/async/async_sitemap_example.py b/scrapegraph-py/examples/sitemap/async/async_sitemap_example.py deleted file mode 100644 index f9d986e..0000000 --- a/scrapegraph-py/examples/sitemap/async/async_sitemap_example.py +++ /dev/null @@ -1,276 +0,0 @@ -""" -Asynchronous example demonstrating how to use the Sitemap API. - -This example shows: -1. How to extract URLs from a website's sitemap asynchronously -2. 
How to process multiple sitemaps concurrently -3. How to combine sitemap with async smartscraper operations - -The Sitemap API automatically discovers the sitemap from: -- robots.txt file -- Common locations like /sitemap.xml -- Sitemap index files - -Requirements: -- Python 3.10+ -- scrapegraph-py -- python-dotenv -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import asyncio -from pathlib import Path -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient - -# Load environment variables from .env file -load_dotenv() - - -async def basic_sitemap_example(): - """Demonstrate basic async sitemap extraction.""" - print("🗺️ Basic Async Sitemap Example") - print("=" * 40) - - async with AsyncClient.from_env() as client: - try: - # Extract sitemap URLs - print("Extracting sitemap from https://scrapegraphai.com...") - response = await client.sitemap(website_url="https://scrapegraphai.com") - - # Display results - print(f"✅ Success! Found {len(response.urls)} URLs\n") - - # Show first 10 URLs - print("First 10 URLs:") - for i, url in enumerate(response.urls[:10], 1): - print(f" {i}. {url}") - - if len(response.urls) > 10: - print(f" ... 
and {len(response.urls) - 10} more URLs") - - return response - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - - -async def save_urls_to_file(urls: list[str], filename: str): - """Save sitemap URLs to a text file asynchronously.""" - output_dir = Path("sitemap_output") - output_dir.mkdir(exist_ok=True) - - file_path = output_dir / f"{filename}.txt" - - # Use asyncio to write file asynchronously - loop = asyncio.get_event_loop() - await loop.run_in_executor( - None, - lambda: file_path.write_text("\n".join(urls), encoding="utf-8") - ) - - print(f"💾 URLs saved to: {file_path}") - return file_path - - -async def concurrent_sitemaps_example(): - """Demonstrate extracting multiple sitemaps concurrently.""" - print("\n⚡ Concurrent Sitemaps Example") - print("=" * 40) - - websites = [ - "https://scrapegraphai.com", - "https://example.com", - "https://python.org" - ] - - async with AsyncClient.from_env() as client: - try: - print(f"Extracting sitemaps from {len(websites)} websites concurrently...") - - # Create tasks for concurrent execution - tasks = [ - client.sitemap(website_url=url) - for url in websites - ] - - # Execute all tasks concurrently - results = await asyncio.gather(*tasks, return_exceptions=True) - - # Process results - successful = 0 - for url, result in zip(websites, results): - if isinstance(result, Exception): - print(f"❌ {url}: {str(result)}") - else: - print(f"✅ {url}: {len(result.urls)} URLs") - successful += 1 - - print(f"\n📊 Summary: {successful}/{len(websites)} successful") - - return [r for r in results if not isinstance(r, Exception)] - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - - -async def filter_and_scrape_example(): - """Demonstrate filtering sitemap URLs and scraping them asynchronously.""" - print("\n🤖 Filter + Async Scrape Example") - print("=" * 40) - - async with AsyncClient.from_env() as client: - try: - # Extract sitemap - print("Step 1: Extracting sitemap...") - response = await 
client.sitemap(website_url="https://scrapegraphai.com") - - # Filter for specific URLs - target_urls = [url for url in response.urls if '/blog/' in url][:3] - - if not target_urls: - target_urls = response.urls[:3] - - print(f"✅ Found {len(response.urls)} URLs") - print(f"🎯 Selected {len(target_urls)} URLs to scrape\n") - - # Create scraping tasks - print("Step 2: Scraping URLs concurrently...") - - async def scrape_url(url): - """Scrape a single URL.""" - try: - result = await client.smartscraper( - website_url=url, - user_prompt="Extract the page title and main heading" - ) - return { - 'url': url, - 'data': result.get('result'), - 'status': 'success' - } - except Exception as e: - return { - 'url': url, - 'error': str(e), - 'status': 'failed' - } - - # Execute scraping tasks concurrently - tasks = [scrape_url(url) for url in target_urls] - results = await asyncio.gather(*tasks) - - # Display results - successful = sum(1 for r in results if r['status'] == 'success') - print(f"\n📊 Summary:") - print(f" ✅ Successful: {successful}/{len(results)}") - print(f" ❌ Failed: {len(results) - successful}/{len(results)}") - - # Show sample results - print("\nSample results:") - for i, result in enumerate(results[:3], 1): - print(f"\n {i}. 
{result['url']}") - if result['status'] == 'success': - print(f" Data: {result['data']}") - else: - print(f" Error: {result['error']}") - - return results - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - - -async def batch_process_with_rate_limit(): - """Demonstrate batch processing with rate limiting.""" - print("\n⏱️ Batch Processing with Rate Limit") - print("=" * 40) - - async with AsyncClient.from_env() as client: - try: - # Extract sitemap - print("Extracting sitemap...") - response = await client.sitemap(website_url="https://scrapegraphai.com") - - # Get URLs to process - urls_to_process = response.urls[:10] - print(f"Processing {len(urls_to_process)} URLs with rate limiting...") - - # Process in batches to avoid overwhelming the API - batch_size = 3 - results = [] - - for i in range(0, len(urls_to_process), batch_size): - batch = urls_to_process[i:i + batch_size] - print(f"\nProcessing batch {i // batch_size + 1}...") - - # Process batch - batch_tasks = [ - client.smartscraper( - website_url=url, - user_prompt="Extract title" - ) - for url in batch - ] - - batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True) - results.extend(batch_results) - - # Rate limiting: wait between batches - if i + batch_size < len(urls_to_process): - print("Waiting 2 seconds before next batch...") - await asyncio.sleep(2) - - successful = sum(1 for r in results if not isinstance(r, Exception)) - print(f"\n✅ Processed {successful}/{len(results)} URLs successfully") - - return results - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - - -async def main(): - """Main function demonstrating async sitemap functionality.""" - print("🚀 Async Sitemap API Examples") - print("=" * 40) - - try: - # Basic sitemap extraction - response = await basic_sitemap_example() - - if response and response.urls: - # Save URLs to file - await save_urls_to_file(response.urls, "async_scrapegraphai_sitemap") - - # Concurrent sitemaps - await 
concurrent_sitemaps_example() - - # Filter and scrape - await filter_and_scrape_example() - - # Batch processing with rate limit - await batch_process_with_rate_limit() - - print("\n🎯 All examples completed!") - - except Exception as e: - print(f"❌ Unexpected error: {str(e)}") - - print("\n📚 Next steps:") - print("• Experiment with different websites") - print("• Adjust batch sizes for your use case") - print("• Combine with other async operations") - print("• Implement custom error handling and retry logic") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/sitemap/sync/sitemap_example.py b/scrapegraph-py/examples/sitemap/sync/sitemap_example.py deleted file mode 100644 index ea963f6..0000000 --- a/scrapegraph-py/examples/sitemap/sync/sitemap_example.py +++ /dev/null @@ -1,251 +0,0 @@ -""" -Basic synchronous example demonstrating how to use the Sitemap API. - -This example shows: -1. How to extract URLs from a website's sitemap -2. How to save sitemap URLs to a file -3. 
How to combine sitemap with other scraping operations - -The Sitemap API automatically discovers the sitemap from: -- robots.txt file -- Common locations like /sitemap.xml -- Sitemap index files - -Equivalent curl command: -curl -X POST https://api.scrapegraphai.com/v1/sitemap \ - -H "Content-Type: application/json" \ - -H "SGAI-APIKEY: your-api-key-here" \ - -d '{ - "website_url": "https://example.com" - }' - -Requirements: -- Python 3.10+ -- scrapegraph-py -- python-dotenv -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -from pathlib import Path -from dotenv import load_dotenv - -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -def basic_sitemap_example(): - """Demonstrate basic sitemap extraction.""" - print("🗺️ Basic Sitemap Example") - print("=" * 40) - - # Initialize client - client = Client.from_env() - - try: - # Extract sitemap URLs - print("Extracting sitemap from https://scrapegraphai.com...") - response = client.sitemap(website_url="https://scrapegraphai.com") - - # Display results - print(f"✅ Success! Found {len(response.urls)} URLs\n") - - # Show first 10 URLs - print("First 10 URLs:") - for i, url in enumerate(response.urls[:10], 1): - print(f" {i}. {url}") - - if len(response.urls) > 10: - print(f" ... 
and {len(response.urls) - 10} more URLs") - - return response - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - finally: - client.close() - - -def save_urls_to_file(urls: list[str], filename: str): - """Save sitemap URLs to a text file.""" - output_dir = Path("sitemap_output") - output_dir.mkdir(exist_ok=True) - - file_path = output_dir / f"{filename}.txt" - with open(file_path, "w", encoding="utf-8") as f: - for url in urls: - f.write(url + "\n") - - print(f"💾 URLs saved to: {file_path}") - return file_path - - -def filter_urls_example(): - """Demonstrate filtering sitemap URLs by pattern.""" - print("\n🔍 Filtering URLs Example") - print("=" * 40) - - client = Client.from_env() - - try: - # Extract sitemap - print("Extracting sitemap...") - response = client.sitemap(website_url="https://scrapegraphai.com") - - # Filter URLs containing specific patterns - blog_urls = [url for url in response.urls if '/blog/' in url] - doc_urls = [url for url in response.urls if '/docs/' in url or '/documentation/' in url] - - print(f"✅ Total URLs: {len(response.urls)}") - print(f"📝 Blog URLs: {len(blog_urls)}") - print(f"📚 Documentation URLs: {len(doc_urls)}") - - # Show sample blog URLs - if blog_urls: - print("\nSample blog URLs:") - for url in blog_urls[:5]: - print(f" • {url}") - - return { - 'all_urls': response.urls, - 'blog_urls': blog_urls, - 'doc_urls': doc_urls - } - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - finally: - client.close() - - -def combine_with_smartscraper(): - """Demonstrate combining sitemap with smartscraper.""" - print("\n🤖 Sitemap + SmartScraper Example") - print("=" * 40) - - client = Client.from_env() - - try: - # First, get sitemap URLs - print("Step 1: Extracting sitemap...") - sitemap_response = client.sitemap(website_url="https://scrapegraphai.com") - - # Filter for specific pages (e.g., blog posts) - target_urls = [url for url in sitemap_response.urls if '/blog/' in url][:3] - - if not 
target_urls: - # If no blog URLs, use first 3 URLs - target_urls = sitemap_response.urls[:3] - - print(f"✅ Found {len(sitemap_response.urls)} URLs") - print(f"🎯 Selected {len(target_urls)} URLs to scrape\n") - - # Scrape selected URLs - print("Step 2: Scraping selected URLs...") - results = [] - - for i, url in enumerate(target_urls, 1): - print(f" Scraping ({i}/{len(target_urls)}): {url}") - - try: - # Use smartscraper to extract data - scrape_result = client.smartscraper( - website_url=url, - user_prompt="Extract the page title and main heading" - ) - - results.append({ - 'url': url, - 'data': scrape_result.get('result'), - 'status': 'success' - }) - print(f" ✅ Success") - - except Exception as e: - results.append({ - 'url': url, - 'error': str(e), - 'status': 'failed' - }) - print(f" ❌ Failed: {str(e)}") - - # Summary - successful = sum(1 for r in results if r['status'] == 'success') - print(f"\n📊 Summary:") - print(f" ✅ Successful: {successful}/{len(results)}") - print(f" ❌ Failed: {len(results) - successful}/{len(results)}") - - return results - - except Exception as e: - print(f"❌ Error: {str(e)}") - return None - finally: - client.close() - - -def demonstrate_curl_equivalent(): - """Show the equivalent curl command.""" - print("\n🌐 Equivalent curl command:") - print("=" * 40) - - print("curl -X POST https://api.scrapegraphai.com/v1/sitemap \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://scrapegraphai.com\"") - print(" }'") - - -def main(): - """Main function demonstrating sitemap functionality.""" - print("🚀 Sitemap API Examples") - print("=" * 40) - - # Show curl equivalent first - demonstrate_curl_equivalent() - - try: - # Run examples - print("\n" + "=" * 40 + "\n") - - # Basic sitemap extraction - response = basic_sitemap_example() - - if response and response.urls: - # Save URLs to file - save_urls_to_file(response.urls, 
"scrapegraphai_sitemap") - - # Filter URLs by pattern - filtered = filter_urls_example() - - if filtered: - # Save filtered URLs - if filtered['blog_urls']: - save_urls_to_file(filtered['blog_urls'], "blog_urls") - if filtered['doc_urls']: - save_urls_to_file(filtered['doc_urls'], "doc_urls") - - # Advanced: Combine with smartscraper - combine_with_smartscraper() - - print("\n🎯 All examples completed!") - - except Exception as e: - print(f"❌ Unexpected error: {str(e)}") - - print("\n📚 Next steps:") - print("• Try the curl command in your terminal") - print("• Experiment with different websites") - print("• Combine sitemap with other scraping operations") - print("• Filter URLs based on your specific needs") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/smartscraper/async/async_generate_schema_example.py b/scrapegraph-py/examples/smartscraper/async/async_generate_schema_example.py deleted file mode 100644 index 5e796a2..0000000 --- a/scrapegraph-py/examples/smartscraper/async/async_generate_schema_example.py +++ /dev/null @@ -1,236 +0,0 @@ -#!/usr/bin/env python3 -""" -Async example script demonstrating the Generate Schema API endpoint using ScrapeGraph Python SDK. - -This script shows how to: -1. Generate a new JSON schema from a search query asynchronously -2. Modify an existing schema -3. Handle different types of search queries -4. Check the status of schema generation requests -5. 
Run multiple concurrent schema generations - -Requirements: -- Python 3.7+ -- scrapegraph-py package -- aiohttp -- python-dotenv -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here - -Usage: - python async_generate_schema_example.py -""" - -import asyncio -import json -import os -from typing import Any, Dict, Optional - -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient - -# Load environment variables from .env file -load_dotenv() - - -class AsyncGenerateSchemaExample: - """Async example class for demonstrating the Generate Schema API using ScrapeGraph SDK""" - - def __init__(self, base_url: str = None, api_key: str = None): - # Get API key from environment if not provided - self.api_key = api_key or os.getenv("SGAI_API_KEY") - if not self.api_key: - raise ValueError( - "API key must be provided or set in .env file as SGAI_API_KEY. " - "Create a .env file with: SGAI_API_KEY=your_api_key_here" - ) - - # Initialize the ScrapeGraph async client - if base_url: - # If base_url is provided, we'll need to modify the client to use it - # For now, we'll use the default client and note the limitation - print(f"⚠️ Note: Custom base_url {base_url} not yet supported in this example") - - self.client = AsyncClient(api_key=self.api_key) - - def print_schema_response( - self, response: Dict[str, Any], title: str = "Schema Generation Response" - ): - """Pretty print the schema generation response""" - print(f"\n{'='*60}") - print(f" {title}") - print(f"{'='*60}") - - if "error" in response and response["error"]: - print(f"❌ Error: {response['error']}") - return - - print(f"✅ Request ID: {response.get('request_id', 'N/A')}") - print(f"📊 Status: {response.get('status', 'N/A')}") - print(f"🔍 User Prompt: {response.get('user_prompt', 'N/A')}") - print(f"✨ Refined Prompt: {response.get('refined_prompt', 'N/A')}") - - if "generated_schema" in response: - print(f"\n📋 Generated Schema:") - 
print(json.dumps(response["generated_schema"], indent=2)) - - async def run_examples(self): - """Run all the example scenarios asynchronously""" - print("🚀 Async Generate Schema API Examples using ScrapeGraph Python SDK") - print("=" * 60) - - # Example 1: Generate schema for e-commerce products - print("\n1️⃣ Example: E-commerce Product Search") - ecommerce_prompt = "Find laptops with specifications like brand, processor, RAM, storage, and price" - try: - response = await self.client.generate_schema(ecommerce_prompt) - self.print_schema_response(response, "E-commerce Products Schema") - except Exception as e: - print(f"❌ Error in e-commerce example: {e}") - - # Example 2: Generate schema for job listings - print("\n2️⃣ Example: Job Listings Search") - job_prompt = "Search for software engineering jobs with company name, position, location, salary range, and requirements" - try: - response = await self.client.generate_schema(job_prompt) - self.print_schema_response(response, "Job Listings Schema") - except Exception as e: - print(f"❌ Error in job listings example: {e}") - - # Example 3: Generate schema for news articles - print("\n3️⃣ Example: News Articles Search") - news_prompt = "Find technology news articles with headline, author, publication date, category, and summary" - try: - response = await self.client.generate_schema(news_prompt) - self.print_schema_response(response, "News Articles Schema") - except Exception as e: - print(f"❌ Error in news articles example: {e}") - - # Example 4: Modify existing schema - print("\n4️⃣ Example: Modify Existing Schema") - existing_schema = { - "$defs": { - "ProductSchema": { - "title": "ProductSchema", - "type": "object", - "properties": { - "name": {"title": "Name", "type": "string"}, - "price": {"title": "Price", "type": "number"}, - }, - "required": ["name", "price"], - } - }, - "title": "ProductList", - "type": "object", - "properties": { - "products": { - "title": "Products", - "type": "array", - "items": {"$ref": 
"#/$defs/ProductSchema"}, - } - }, - "required": ["products"], - } - - modification_prompt = ( - "Add brand, category, and rating fields to the existing product schema" - ) - try: - response = await self.client.generate_schema(modification_prompt, existing_schema) - self.print_schema_response(response, "Modified Product Schema") - except Exception as e: - print(f"❌ Error in schema modification example: {e}") - - # Example 5: Complex nested schema - print("\n5️⃣ Example: Complex Nested Schema") - complex_prompt = "Create a schema for a company directory with departments, each containing employees with contact info and projects" - try: - response = await self.client.generate_schema(complex_prompt) - self.print_schema_response(response, "Company Directory Schema") - except Exception as e: - print(f"❌ Error in complex schema example: {e}") - - async def run_concurrent_examples(self): - """Run multiple schema generations concurrently""" - print("\n🔄 Running Concurrent Examples...") - - # Example: Multiple concurrent schema generations - prompts = [ - "Find restaurants with name, cuisine, rating, and address", - "Search for books with title, author, genre, and publication year", - "Find movies with title, director, cast, rating, and release date", - ] - - try: - tasks = [self.client.generate_schema(prompt) for prompt in prompts] - results = await asyncio.gather(*tasks) - - for i, (prompt, result) in enumerate(zip(prompts, results), 1): - self.print_schema_response(result, f"Concurrent Example {i}: {prompt[:30]}...") - - except Exception as e: - print(f"❌ Error in concurrent examples: {e}") - - async def demonstrate_status_checking(self): - """Demonstrate how to check the status of schema generation requests""" - print("\n🔄 Demonstrating Status Checking...") - - # Generate a simple schema first - prompt = "Find restaurants with name, cuisine, rating, and address" - try: - response = await self.client.generate_schema(prompt) - request_id = response.get('request_id') - - if 
request_id: - print(f"📝 Generated schema request with ID: {request_id}") - - # Check the status - print("🔍 Checking status...") - status_response = await self.client.get_schema_status(request_id) - self.print_schema_response(status_response, f"Status Check for {request_id}") - else: - print("⚠️ No request ID returned from schema generation") - - except Exception as e: - print(f"❌ Error in status checking demonstration: {e}") - - async def close(self): - """Close the client to free up resources""" - if hasattr(self, 'client'): - await self.client.close() - - -async def main(): - """Main function to run the async examples""" - # Check if API key is available - if not os.getenv("SGAI_API_KEY"): - print("Error: SGAI_API_KEY not found in .env file") - print("Please create a .env file with your API key:") - print("SGAI_API_KEY=your_api_key_here") - return - - # Initialize the example class - example = AsyncGenerateSchemaExample() - - try: - # Run synchronous examples - await example.run_examples() - - # Run concurrent examples - await example.run_concurrent_examples() - - # Demonstrate status checking - await example.demonstrate_status_checking() - - except Exception as e: - print(f"❌ Unexpected Error: {e}") - finally: - # Always close the client - await example.close() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_cookies_example.py b/scrapegraph-py/examples/smartscraper/async/async_smartscraper_cookies_example.py deleted file mode 100644 index b68794b..0000000 --- a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_cookies_example.py +++ /dev/null @@ -1,131 +0,0 @@ -""" -Example demonstrating how to use the SmartScraper API with cookies (Async). - -This example shows how to: -1. Set up the API request with cookies for authentication -2. Use cookies with infinite scrolling -3. Define a Pydantic model for structured output -4. Make the API call and handle the response -5. 
Process the extracted data - -Requirements: -- Python 3.7+ -- scrapegraph-py -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import asyncio -import json -import os -from typing import Dict - -from dotenv import load_dotenv -from pydantic import BaseModel, Field - -from scrapegraph_py import AsyncClient - -# Load environment variables from .env file -load_dotenv() - - -# Define the data models for structured output -class CookieInfo(BaseModel): - """Model representing cookie information.""" - - cookies: Dict[str, str] = Field(description="Dictionary of cookie key-value pairs") - - -async def main(): - """Example usage of the cookies scraper.""" - # Check if API key is available - if not os.getenv("SGAI_API_KEY"): - print("Error: SGAI_API_KEY not found in .env file") - print("Please create a .env file with your API key:") - print("SGAI_API_KEY=your_api_key_here") - return - - # Initialize the async client - async with AsyncClient.from_env() as client: - # Example 1: Basic cookies example (httpbin.org/cookies) - print("=" * 60) - print("EXAMPLE 1: Basic Cookies Example") - print("=" * 60) - - website_url = "https://httpbin.org/cookies" - user_prompt = "Extract all cookies info" - cookies = {"cookies_key": "cookies_value"} - - try: - # Perform the scraping with cookies - response = await client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - cookies=cookies, - output_schema=CookieInfo, - ) - - # Print the results - print("\nExtracted Cookie Information:") - print(json.dumps(response, indent=2)) - - except Exception as e: - print(f"Error occurred: {str(e)}") - - # Example 2: Cookies with infinite scrolling - print("\n" + "=" * 60) - print("EXAMPLE 2: Cookies with Infinite Scrolling") - print("=" * 60) - - website_url = "https://httpbin.org/cookies" - user_prompt = "Extract all cookies and scroll information" - cookies = {"session_id": "abc123", "user_token": "xyz789"} - - try: - # Perform the 
scraping with cookies and infinite scrolling - response = await client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - cookies=cookies, - number_of_scrolls=3, - output_schema=CookieInfo, - ) - - # Print the results - print("\nExtracted Cookie Information with Scrolling:") - print(json.dumps(response, indent=2)) - - except Exception as e: - print(f"Error occurred: {str(e)}") - - # Example 3: Cookies with pagination - print("\n" + "=" * 60) - print("EXAMPLE 3: Cookies with Pagination") - print("=" * 60) - - website_url = "https://httpbin.org/cookies" - user_prompt = "Extract all cookies from multiple pages" - cookies = {"auth_token": "secret123", "preferences": "dark_mode"} - - try: - # Perform the scraping with cookies and pagination - response = await client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - cookies=cookies, - total_pages=3, - output_schema=CookieInfo, - ) - - # Print the results - print("\nExtracted Cookie Information with Pagination:") - print(json.dumps(response, indent=2)) - - except Exception as e: - print(f"Error occurred: {str(e)}") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_example.py b/scrapegraph-py/examples/smartscraper/async/async_smartscraper_example.py deleted file mode 100644 index f9a0f69..0000000 --- a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_example.py +++ /dev/null @@ -1,56 +0,0 @@ -import asyncio -import os - -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - - -async def main(): - # Initialize async client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. 
Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. Create a .env file with: SGAI_API_KEY=your-api-key-here") - return - - sgai_client = AsyncClient(api_key=api_key) - - # Concurrent scraping requests - urls = [ - "https://scrapegraphai.com/", - "https://github.com/ScrapeGraphAI/Scrapegraph-ai", - ] - - tasks = [ - sgai_client.smartscraper( - website_url=url, user_prompt="Summarize the main content" - ) - for url in urls - ] - - # Execute requests concurrently - responses = await asyncio.gather(*tasks, return_exceptions=True) - - # Process results - for i, response in enumerate(responses): - if isinstance(response, Exception): - print(f"\nError for {urls[i]}: {response}") - else: - print(f"\nPage {i+1} Summary:") - print(f"URL: {urls[i]}") - print(f"Result: {response['result']}") - - await sgai_client.close() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_infinite_scroll_example.py b/scrapegraph-py/examples/smartscraper/async/async_smartscraper_infinite_scroll_example.py deleted file mode 100644 index 9e4f180..0000000 --- a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_infinite_scroll_example.py +++ /dev/null @@ -1,62 +0,0 @@ -import asyncio - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_logging(level="INFO") - - -async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None: - """Scrape companies from a specific YC batch with infinite scroll.""" - try: - # Initial scrape with infinite scroll enabled - response = await client.smartscraper( - website_url=url, - user_prompt="Extract all company information from this page, including name, description, and website", - number_of_scrolls=10, - ) - # Process the results - companies = response.get("result", {}).get("companies", []) - if not companies: - print(f"No companies found for batch {batch}") - return - - # Save or 
process the companies data - print(f"Found {len(companies)} companies in batch {batch}") - - for company in companies: - print(f"Company: {company.get('name', 'N/A')}") - print(f"Description: {company.get('description', 'N/A')}") - print(f"Website: {company.get('website', 'N/A')}") - print("-" * 50) - - except Exception as e: - print(f"Error scraping batch {batch}: {str(e)}") - - -async def main(): - # Initialize async client - client = AsyncClient(api_key="Your-API-Key") - - try: - # Example YC batch URLs - batch_urls = { - "W24": "https://www.ycombinator.com/companies?batch=Winter%202024", - "S23": "https://www.ycombinator.com/companies?batch=Summer%202023", - } - - # Create tasks for each batch - tasks = [ - scrape_companies(client, url, batch) for batch, url in batch_urls.items() - ] - - # Execute all batch scraping concurrently - await asyncio.gather(*tasks) - - finally: - # Ensure client is properly closed - await client.close() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_pagination_example.py b/scrapegraph-py/examples/smartscraper/async/async_smartscraper_pagination_example.py deleted file mode 100644 index 4ef32d4..0000000 --- a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_pagination_example.py +++ /dev/null @@ -1,286 +0,0 @@ -#!/usr/bin/env python3 -""" -SmartScraper Pagination Example (Async) - -This example demonstrates how to use pagination functionality with SmartScraper API using the asynchronous client. 
-""" - -import asyncio -import json -import logging -import os -import time -from typing import List, Optional - -from dotenv import load_dotenv -from pydantic import BaseModel - -from scrapegraph_py import AsyncClient -from scrapegraph_py.exceptions import APIError - -# Load environment variables from .env file -load_dotenv() - - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - handlers=[logging.StreamHandler()], -) -logger = logging.getLogger(__name__) - - -class ProductInfo(BaseModel): - """Schema for product information""" - - name: str - price: Optional[str] = None - rating: Optional[str] = None - image_url: Optional[str] = None - description: Optional[str] = None - - -class ProductList(BaseModel): - """Schema for list of products""" - - products: List[ProductInfo] - - -async def smartscraper_pagination_example(): - """Example of using pagination with SmartScraper (async)""" - - print("SmartScraper Pagination Example (Async)") - print("=" * 50) - - # Initialize client from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: SGAI_API_KEY=your-api-key-here") - return - - try: - client = AsyncClient(api_key=api_key) - except Exception as e: - print(f"❌ Error initializing client: {e}") - return - - # Configuration - website_url = "https://www.amazon.in/s?k=tv&crid=1TEF1ZFVLU8R8&sprefix=t%2Caps%2C390&ref=nb_sb_noss_2" - user_prompt = "Extract all product info including name, price, rating, image_url, and description" - total_pages = 3 # Number of pages to scrape - - print(f"🌐 Website URL: {website_url}") - print(f"📝 User Prompt: {user_prompt}") - print(f"📄 Total Pages: {total_pages}") - print("-" * 50) - - try: - # Start timing - start_time = time.time() - - # Make the request with pagination - result = await client.smartscraper( - user_prompt=user_prompt, - website_url=website_url, - output_schema=ProductList, - total_pages=total_pages, - ) - - # Calculate duration - duration = time.time() - start_time - - print(f"✅ Request completed in {duration:.2f} seconds") - print(f"📊 Response type: {type(result)}") - - # Display results - if isinstance(result, dict): - print("\n🔍 Response:") - print(json.dumps(result, indent=2, ensure_ascii=False)) - - # Check for pagination success indicators - if "data" in result: - print( - f"\n✨ Pagination successful! Data extracted from {total_pages} pages" - ) - - elif isinstance(result, list): - print(f"\n✅ Pagination successful! Extracted {len(result)} items") - for i, item in enumerate(result[:5]): # Show first 5 items - print(f" {i+1}. {item}") - if len(result) > 5: - print(f" ... 
and {len(result) - 5} more items") - else: - print(f"\n📋 Result: {result}") - - except APIError as e: - print(f"❌ API Error: {e}") - print("This could be due to:") - print(" - Invalid API key") - print(" - Rate limiting") - print(" - Server issues") - - except Exception as e: - print(f"❌ Unexpected error: {e}") - print("This could be due to:") - print(" - Network connectivity issues") - print(" - Invalid website URL") - print(" - Pagination limitations") - - -async def test_concurrent_pagination(): - """Test multiple pagination requests concurrently""" - - print("\n" + "=" * 50) - print("Testing concurrent pagination requests") - print("=" * 50) - - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return - - try: - client = AsyncClient(api_key=api_key) - except Exception as e: - print(f"❌ Error initializing client: {e}") - return - - # Test concurrent requests - urls = [ - "https://example.com/products?page=1", - "https://example.com/products?page=2", - "https://example.com/products?page=3", - ] - - tasks = [] - for i, url in enumerate(urls): - print(f"🚀 Creating task {i+1} for URL: {url}") - # Note: In a real scenario, you would use actual URLs - # This is just to demonstrate the async functionality - tasks.append( - asyncio.create_task(simulate_pagination_request(client, url, i + 1)) - ) - - print(f"⏱️ Starting {len(tasks)} concurrent tasks...") - start_time = time.time() - - try: - results = await asyncio.gather(*tasks, return_exceptions=True) - duration = time.time() - start_time - - print(f"✅ All tasks completed in {duration:.2f} seconds") - - for i, result in enumerate(results): - if isinstance(result, Exception): - print(f"❌ Task {i+1} failed: {result}") - else: - print(f"✅ Task {i+1} succeeded: {result}") - - except Exception as e: - print(f"❌ Concurrent execution failed: {e}") - - -async def simulate_pagination_request(client: AsyncClient, url: str, task_id: int): - """Simulate a 
pagination request (for demonstration)""" - - print(f"📋 Task {task_id}: Processing {url}") - - # Simulate some work - await asyncio.sleep(0.5) - - # Return a simulated result - return f"Task {task_id} completed successfully" - - -async def test_pagination_with_different_parameters(): - """Test pagination with different parameters""" - - print("\n" + "=" * 50) - print("Testing pagination with different parameters") - print("=" * 50) - - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return - - try: - AsyncClient(api_key=api_key) - except Exception as e: - print(f"❌ Error initializing client: {e}") - return - - # Test cases - test_cases = [ - { - "name": "Single page (default)", - "url": "https://example.com", - "total_pages": None, - "user_prompt": "Extract basic info", - }, - { - "name": "Two pages with schema", - "url": "https://example.com/products", - "total_pages": 2, - "user_prompt": "Extract product information", - "output_schema": ProductList, - }, - { - "name": "Maximum pages with scrolling", - "url": "https://example.com/search", - "total_pages": 5, - "user_prompt": "Extract all available data", - "number_of_scrolls": 3, - }, - ] - - for test_case in test_cases: - print(f"\n🧪 Test: {test_case['name']}") - print(f" Pages: {test_case['total_pages']}") - print(f" Prompt: {test_case['user_prompt']}") - - try: - # This is just to demonstrate the API call structure - # In a real scenario, you'd make actual API calls - print(" ✅ Configuration valid") - - except Exception as e: - print(f" ❌ Configuration error: {e}") - - -async def main(): - """Main function to run the pagination examples""" - - print("ScrapeGraph SDK - SmartScraper Pagination Examples (Async)") - print("=" * 60) - - # Run the main example - await smartscraper_pagination_example() - - # Test concurrent pagination - await test_concurrent_pagination() - - # Test different parameters - await 
test_pagination_with_different_parameters() - - print("\n" + "=" * 60) - print("Examples completed!") - print("\nNext steps:") - print("1. Set SGAI_API_KEY environment variable") - print("2. Replace example URLs with real websites") - print("3. Adjust total_pages parameter (1-10)") - print("4. Customize user_prompt for your use case") - print("5. Define output_schema for structured data") - print("\nAsync-specific tips:") - print("- Use asyncio.gather() for concurrent requests") - print("- Consider rate limiting with asyncio.Semaphore") - print("- Handle exceptions properly in async context") - print("- Use proper context managers for cleanup") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_render_heavy_example.py b/scrapegraph-py/examples/smartscraper/async/async_smartscraper_render_heavy_example.py deleted file mode 100644 index 90f2260..0000000 --- a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_render_heavy_example.py +++ /dev/null @@ -1,39 +0,0 @@ -import asyncio -import os - -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - - -async def main(): - # Initialize the client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: SGAI_API_KEY=your-api-key-here") - return - - async with AsyncClient(api_key=api_key) as sgai_client: - # SmartScraper request with render_heavy_js enabled - response = await sgai_client.smartscraper( - website_url="https://example.com", - user_prompt="Find the CEO of company X and their contact details", - render_heavy_js=True, # Enable heavy JavaScript rendering - ) - - # Print the response - print(f"Request ID: {response['request_id']}") - print(f"Result: {response['result']}") - - -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file diff --git a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_schema_example.py b/scrapegraph-py/examples/smartscraper/async/async_smartscraper_schema_example.py deleted file mode 100644 index d7cd4fa..0000000 --- a/scrapegraph-py/examples/smartscraper/async/async_smartscraper_schema_example.py +++ /dev/null @@ -1,34 +0,0 @@ -import asyncio - -from pydantic import BaseModel, Field - -from scrapegraph_py import AsyncClient - - -# Define a Pydantic model for the output schema -class WebpageSchema(BaseModel): - title: str = Field(description="The title of the webpage") - description: str = Field(description="The description of the webpage") - summary: str = Field(description="A brief summary of the webpage") - - -async def main(): - # Initialize the async client - sgai_client = AsyncClient(api_key="your-api-key-here") - - # SmartScraper request with output schema - response = await sgai_client.smartscraper( - website_url="https://example.com", - user_prompt="Extract webpage information", - output_schema=WebpageSchema, - ) - - # Print the response - print(f"Request ID: {response['request_id']}") - print(f"Result: {response['result']}") - - await sgai_client.close() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scrapegraph-py/examples/smartscraper/sync/generate_schema_example.py b/scrapegraph-py/examples/smartscraper/sync/generate_schema_example.py 
deleted file mode 100644 index 205e579..0000000 --- a/scrapegraph-py/examples/smartscraper/sync/generate_schema_example.py +++ /dev/null @@ -1,208 +0,0 @@ -#!/usr/bin/env python3 -""" -Example script demonstrating the Generate Schema API endpoint using ScrapeGraph Python SDK. - -This script shows how to: -1. Generate a new JSON schema from a search query -2. Modify an existing schema -3. Handle different types of search queries -4. Check the status of schema generation requests - -Requirements: -- Python 3.7+ -- scrapegraph-py package -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here - -Usage: - python generate_schema_example.py -""" - -import json -import os -from typing import Any, Dict, Optional - -from dotenv import load_dotenv - -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -class GenerateSchemaExample: - """Example class for demonstrating the Generate Schema API using ScrapeGraph SDK""" - - def __init__(self, base_url: str = None, api_key: str = None): - # Get API key from environment if not provided - self.api_key = api_key or os.getenv("SGAI_API_KEY") - if not self.api_key: - raise ValueError( - "API key must be provided or set in .env file as SGAI_API_KEY. 
" - "Create a .env file with: SGAI_API_KEY=your_api_key_here" - ) - - # Initialize the ScrapeGraph client - if base_url: - # If base_url is provided, we'll need to modify the client to use it - # For now, we'll use the default client and note the limitation - print(f"⚠️ Note: Custom base_url {base_url} not yet supported in this example") - - self.client = Client(api_key=self.api_key) - - def print_schema_response( - self, response: Dict[str, Any], title: str = "Schema Generation Response" - ): - """Pretty print the schema generation response""" - print(f"\n{'='*60}") - print(f" {title}") - print(f"{'='*60}") - - if "error" in response and response["error"]: - print(f"❌ Error: {response['error']}") - return - - print(f"✅ Request ID: {response.get('request_id', 'N/A')}") - print(f"📊 Status: {response.get('status', 'N/A')}") - print(f"🔍 User Prompt: {response.get('user_prompt', 'N/A')}") - print(f"✨ Refined Prompt: {response.get('refined_prompt', 'N/A')}") - - if "generated_schema" in response: - print(f"\n📋 Generated Schema:") - print(json.dumps(response["generated_schema"], indent=2)) - - def run_examples(self): - """Run all the example scenarios""" - print("🚀 Generate Schema API Examples using ScrapeGraph Python SDK") - print("=" * 60) - - # Example 1: Generate schema for e-commerce products - print("\n1️⃣ Example: E-commerce Product Search") - ecommerce_prompt = "Find laptops with specifications like brand, processor, RAM, storage, and price" - try: - response = self.client.generate_schema(ecommerce_prompt) - self.print_schema_response(response, "E-commerce Products Schema") - except Exception as e: - print(f"❌ Error in e-commerce example: {e}") - - # Example 2: Generate schema for job listings - print("\n2️⃣ Example: Job Listings Search") - job_prompt = "Search for software engineering jobs with company name, position, location, salary range, and requirements" - try: - response = self.client.generate_schema(job_prompt) - self.print_schema_response(response, "Job 
Listings Schema") - except Exception as e: - print(f"❌ Error in job listings example: {e}") - - # Example 3: Generate schema for news articles - print("\n3️⃣ Example: News Articles Search") - news_prompt = "Find technology news articles with headline, author, publication date, category, and summary" - try: - response = self.client.generate_schema(news_prompt) - self.print_schema_response(response, "News Articles Schema") - except Exception as e: - print(f"❌ Error in news articles example: {e}") - - # Example 4: Modify existing schema - print("\n4️⃣ Example: Modify Existing Schema") - existing_schema = { - "$defs": { - "ProductSchema": { - "title": "ProductSchema", - "type": "object", - "properties": { - "name": {"title": "Name", "type": "string"}, - "price": {"title": "Price", "type": "number"}, - }, - "required": ["name", "price"], - } - }, - "title": "ProductList", - "type": "object", - "properties": { - "products": { - "title": "Products", - "type": "array", - "items": {"$ref": "#/$defs/ProductSchema"}, - } - }, - "required": ["products"], - } - - modification_prompt = ( - "Add brand, category, and rating fields to the existing product schema" - ) - try: - response = self.client.generate_schema(modification_prompt, existing_schema) - self.print_schema_response(response, "Modified Product Schema") - except Exception as e: - print(f"❌ Error in schema modification example: {e}") - - # Example 5: Complex nested schema - print("\n5️⃣ Example: Complex Nested Schema") - complex_prompt = "Create a schema for a company directory with departments, each containing employees with contact info and projects" - try: - response = self.client.generate_schema(complex_prompt) - self.print_schema_response(response, "Company Directory Schema") - except Exception as e: - print(f"❌ Error in complex schema example: {e}") - - def demonstrate_status_checking(self): - """Demonstrate how to check the status of schema generation requests""" - print("\n🔄 Demonstrating Status Checking...") - 
- # Generate a simple schema first - prompt = "Find restaurants with name, cuisine, rating, and address" - try: - response = self.client.generate_schema(prompt) - request_id = response.get('request_id') - - if request_id: - print(f"📝 Generated schema request with ID: {request_id}") - - # Check the status - print("🔍 Checking status...") - status_response = self.client.get_schema_status(request_id) - self.print_schema_response(status_response, f"Status Check for {request_id}") - else: - print("⚠️ No request ID returned from schema generation") - - except Exception as e: - print(f"❌ Error in status checking demonstration: {e}") - - def close(self): - """Close the client to free up resources""" - if hasattr(self, 'client'): - self.client.close() - - -def main(): - """Main function to run the examples""" - # Check if API key is available - if not os.getenv("SGAI_API_KEY"): - print("Error: SGAI_API_KEY not found in .env file") - print("Please create a .env file with your API key:") - print("SGAI_API_KEY=your_api_key_here") - return - - # Initialize the example class - example = GenerateSchemaExample() - - try: - # Run synchronous examples - example.run_examples() - - # Demonstrate status checking - example.demonstrate_status_checking() - - except Exception as e: - print(f"❌ Unexpected Error: {e}") - finally: - # Always close the client - example.close() - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/smartscraper/sync/sample_product.html b/scrapegraph-py/examples/smartscraper/sync/sample_product.html deleted file mode 100644 index 2872e1a..0000000 --- a/scrapegraph-py/examples/smartscraper/sync/sample_product.html +++ /dev/null @@ -1,77 +0,0 @@ - - - - - - Sample Product Page - - - -
-

Premium Wireless Headphones

-

€299.99

- -
-

Description

-

- Experience crystal-clear audio with our premium wireless headphones. - Featuring advanced noise cancellation technology and up to 30 hours - of battery life, these headphones are perfect for music lovers and - professionals alike. -

-
- -
-

Key Features

-
    -
  • Active Noise Cancellation (ANC)
  • -
  • 30-hour battery life
  • -
  • Bluetooth 5.0 connectivity
  • -
  • Premium leather ear cushions
  • -
  • Foldable design with carry case
  • -
  • Built-in microphone for calls
  • -
-
- -
-

Contact Information

-

Email: support@example.com

-

Phone: +1 (555) 123-4567

-

Website: www.example.com

-
- -
-

Stock Status: In Stock

-

SKU: WH-1000XM5-BLK

-

Category: Electronics > Audio > Headphones

-
-
- - \ No newline at end of file diff --git a/scrapegraph-py/examples/smartscraper/sync/smartscraper_cookies_example.py b/scrapegraph-py/examples/smartscraper/sync/smartscraper_cookies_example.py deleted file mode 100644 index cc81235..0000000 --- a/scrapegraph-py/examples/smartscraper/sync/smartscraper_cookies_example.py +++ /dev/null @@ -1,134 +0,0 @@ -""" -Example demonstrating how to use the SmartScraper API with cookies. - -This example shows how to: -1. Set up the API request with cookies for authentication -2. Use cookies with infinite scrolling -3. Define a Pydantic model for structured output -4. Make the API call and handle the response -5. Process the extracted data - -Requirements: -- Python 3.7+ -- scrapegraph-py -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import json -import os -from typing import Dict - -from dotenv import load_dotenv -from pydantic import BaseModel, Field - -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -# Define the data models for structured output -class CookieInfo(BaseModel): - """Model representing cookie information.""" - - cookies: Dict[str, str] = Field(description="Dictionary of cookie key-value pairs") - - -def main(): - """Example usage of the cookies scraper.""" - # Check if API key is available - if not os.getenv("SGAI_API_KEY"): - print("Error: SGAI_API_KEY not found in .env file") - print("Please create a .env file with your API key:") - print("SGAI_API_KEY=your_api_key_here") - return - - # Initialize the client - client = Client.from_env() - - # Example 1: Basic cookies example (httpbin.org/cookies) - print("=" * 60) - print("EXAMPLE 1: Basic Cookies Example") - print("=" * 60) - - website_url = "https://httpbin.org/cookies" - user_prompt = "Extract all cookies info" - cookies = {"cookies_key": "cookies_value"} - - try: - # Perform the scraping with cookies - response = client.smartscraper( - 
website_url=website_url, - user_prompt=user_prompt, - cookies=cookies, - output_schema=CookieInfo, - ) - - # Print the results - print("\nExtracted Cookie Information:") - print(json.dumps(response, indent=2)) - - except Exception as e: - print(f"Error occurred: {str(e)}") - - # Example 2: Cookies with infinite scrolling - print("\n" + "=" * 60) - print("EXAMPLE 2: Cookies with Infinite Scrolling") - print("=" * 60) - - website_url = "https://httpbin.org/cookies" - user_prompt = "Extract all cookies and scroll information" - cookies = {"session_id": "abc123", "user_token": "xyz789"} - - try: - # Perform the scraping with cookies and infinite scrolling - response = client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - cookies=cookies, - number_of_scrolls=3, - output_schema=CookieInfo, - ) - - # Print the results - print("\nExtracted Cookie Information with Scrolling:") - print(json.dumps(response, indent=2)) - - except Exception as e: - print(f"Error occurred: {str(e)}") - - # Example 3: Cookies with pagination - print("\n" + "=" * 60) - print("EXAMPLE 3: Cookies with Pagination") - print("=" * 60) - - website_url = "https://httpbin.org/cookies" - user_prompt = "Extract all cookies from multiple pages" - cookies = {"auth_token": "secret123", "preferences": "dark_mode"} - - try: - # Perform the scraping with cookies and pagination - response = client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - cookies=cookies, - total_pages=3, - output_schema=CookieInfo, - ) - - # Print the results - print("\nExtracted Cookie Information with Pagination:") - print(json.dumps(response, indent=2)) - - except Exception as e: - print(f"Error occurred: {str(e)}") - - # Close the client - client.close() - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/smartscraper/sync/smartscraper_example.py b/scrapegraph-py/examples/smartscraper/sync/smartscraper_example.py deleted file mode 100644 index f6b8206..0000000 --- 
a/scrapegraph-py/examples/smartscraper/sync/smartscraper_example.py +++ /dev/null @@ -1,36 +0,0 @@ -import os - -from dotenv import load_dotenv - -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - -# Initialize the client with API key from environment variable -api_key = os.getenv("SGAI_API_KEY") -if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. Create a .env file with: SGAI_API_KEY=your-api-key-here") - exit(1) - -sgai_client = Client(api_key=api_key) - -# SmartScraper request -response = sgai_client.smartscraper( - website_url="https://example.com", - # website_html="...", # Optional, if you want to pass in HTML content instead of a URL - user_prompt="Extract the main heading, description, and summary of the webpage", -) - - -# Print the response -print(f"Request ID: {response['request_id']}") -print(f"Result: {response['result']}") - -sgai_client.close() diff --git a/scrapegraph-py/examples/smartscraper/sync/smartscraper_infinite_scroll_example.py b/scrapegraph-py/examples/smartscraper/sync/smartscraper_infinite_scroll_example.py deleted file mode 100644 index ece6579..0000000 --- a/scrapegraph-py/examples/smartscraper/sync/smartscraper_infinite_scroll_example.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -from typing import List - -from dotenv import load_dotenv -from pydantic import BaseModel - -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - - -# Define the output schema -class Company(BaseModel): - name: str - category: str - location: str - - -class CompaniesResponse(BaseModel): - companies: List[Company] - - -# Initialize the client with API 
key from environment variable -# Make sure to set SGAI_API_KEY in your environment or .env file -api_key = os.getenv("SGAI_API_KEY") -if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. Create a .env file with: SGAI_API_KEY=your-api-key-here") - exit(1) - -sgai_client = Client(api_key=api_key) - -try: - # SmartScraper request with infinite scroll - response = sgai_client.smartscraper( - website_url="https://www.ycombinator.com/companies?batch=Spring%202025", - user_prompt="Extract all company names and their categories from the page", - output_schema=CompaniesResponse, - number_of_scrolls=10, # Scroll 10 times to load more companies - ) - - # Print the response - print(f"Request ID: {response['request_id']}") - - # Parse and print the results in a structured way - result = CompaniesResponse.model_validate(response["result"]) - print("\nExtracted Companies:") - print("-" * 80) - for company in result.companies: - print(f"Name: {company.name}") - print(f"Category: {company.category}") - print(f"Location: {company.location}") - print("-" * 80) - -except Exception as e: - print(f"An error occurred: {e}") - -finally: - sgai_client.close() diff --git a/scrapegraph-py/examples/smartscraper/sync/smartscraper_local_html_example.py b/scrapegraph-py/examples/smartscraper/sync/smartscraper_local_html_example.py deleted file mode 100644 index c79cab1..0000000 --- a/scrapegraph-py/examples/smartscraper/sync/smartscraper_local_html_example.py +++ /dev/null @@ -1,115 +0,0 @@ -""" -SmartScraper with Local HTML File Example - -This example demonstrates how to use SmartScraper with a local HTML file -instead of fetching content from a URL. 
Perfect for: -- Testing with static HTML files -- Processing saved web pages -- Working offline -- Debugging and development - -Requirements: -- SGAI_API_KEY environment variable must be set -""" - -import os -from pathlib import Path - -from dotenv import load_dotenv - -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - - -def read_html_file(file_path: str) -> str: - """ - Read HTML content from a local file. - - Args: - file_path: Path to the HTML file - - Returns: - HTML content as string - """ - try: - with open(file_path, "r", encoding="utf-8") as f: - return f.read() - except FileNotFoundError: - print(f"❌ File not found: {file_path}") - raise - except Exception as e: - print(f"❌ Error reading file: {str(e)}") - raise - - -def main(): - """Extract data from a local HTML file using SmartScraper.""" - - # Initialize the client with API key from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: SGAI_API_KEY=your-api-key-here") - return - - # Path to the sample HTML file in the same directory - script_dir = Path(__file__).parent - html_file_path = script_dir / "sample_product.html" - - # Check if the HTML file exists - if not html_file_path.exists(): - print(f"❌ HTML file not found at: {html_file_path}") - print(" Make sure sample_product.html exists in the sync/ directory") - return - - # Read the HTML file - print(f"📂 Reading HTML file: {html_file_path.name}") - html_content = read_html_file(str(html_file_path)) - - # Check file size (max 2MB) - html_size_mb = len(html_content.encode("utf-8")) / (1024 * 1024) - print(f"📊 HTML file size: {html_size_mb:.4f} MB") - - if html_size_mb > 2: - print("❌ HTML file exceeds 2MB limit") - return - - # Define what to extract - user_prompt = "Extract the product name, price, description, all features, and contact information" - - # Create client and scrape using local HTML - sgai_client = Client(api_key=api_key) - - print(f"🎯 Prompt: {user_prompt}") - print() - - # Pass website_html instead of website_url - # Note: website_url should be empty string when using website_html - response = sgai_client.smartscraper( - website_url="", # Empty when using website_html - user_prompt=user_prompt, - website_html=html_content, # Pass the HTML content here - ) - - # Print the response - print("✅ Success! 
Extracted data from local HTML:") - print() - print(f"Request ID: {response['request_id']}") - print(f"Result: {response['result']}") - print() - - sgai_client.close() - - -if __name__ == "__main__": - print("SmartScraper with Local HTML File Example") - print("=" * 45) - print() - main() diff --git a/scrapegraph-py/examples/smartscraper/sync/smartscraper_pagination_example.py b/scrapegraph-py/examples/smartscraper/sync/smartscraper_pagination_example.py deleted file mode 100644 index 08d76aa..0000000 --- a/scrapegraph-py/examples/smartscraper/sync/smartscraper_pagination_example.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python3 -""" -SmartScraper Pagination Example (Sync) - -This example demonstrates how to use pagination functionality with SmartScraper API using the synchronous client. -""" - -import json -import logging -import os -import time -from typing import List, Optional - -from dotenv import load_dotenv -from pydantic import BaseModel - -from scrapegraph_py import Client -from scrapegraph_py.exceptions import APIError - -# Load environment variables from .env file -load_dotenv() - - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - handlers=[logging.StreamHandler()], -) -logger = logging.getLogger(__name__) - - -class ProductInfo(BaseModel): - """Schema for product information""" - - name: str - price: Optional[str] = None - rating: Optional[str] = None - image_url: Optional[str] = None - description: Optional[str] = None - - -class ProductList(BaseModel): - """Schema for list of products""" - - products: List[ProductInfo] - - -def smartscraper_pagination_example(): - """Example of using pagination with SmartScraper (sync)""" - - print("SmartScraper Pagination Example (Sync)") - print("=" * 50) - - # Initialize client from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please 
either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. Create a .env file with: SGAI_API_KEY=your-api-key-here") - return - - try: - client = Client(api_key=api_key) - except Exception as e: - print(f"❌ Error initializing client: {e}") - return - - # Configuration - website_url = "https://www.amazon.in/s?k=tv&crid=1TEF1ZFVLU8R8&sprefix=t%2Caps%2C390&ref=nb_sb_noss_2" - user_prompt = "Extract all product info including name, price, rating, image_url, and description" - total_pages = 3 # Number of pages to scrape - - print(f"🌐 Website URL: {website_url}") - print(f"📝 User Prompt: {user_prompt}") - print(f"📄 Total Pages: {total_pages}") - print("-" * 50) - - try: - # Start timing - start_time = time.time() - - # Make the request with pagination - result = client.smartscraper( - user_prompt=user_prompt, - website_url=website_url, - output_schema=ProductList, - total_pages=total_pages, - ) - - # Calculate duration - duration = time.time() - start_time - - print(f"✅ Request completed in {duration:.2f} seconds") - print(f"📊 Response type: {type(result)}") - - # Display results - if isinstance(result, dict): - print("\n🔍 Response:") - print(json.dumps(result, indent=2, ensure_ascii=False)) - - # Check for pagination success indicators - if "data" in result: - print( - f"\n✨ Pagination successful! Data extracted from {total_pages} pages" - ) - - elif isinstance(result, list): - print(f"\n✅ Pagination successful! Extracted {len(result)} items") - for i, item in enumerate(result[:5]): # Show first 5 items - print(f" {i+1}. {item}") - if len(result) > 5: - print(f" ... 
and {len(result) - 5} more items") - else: - print(f"\n📋 Result: {result}") - - except APIError as e: - print(f"❌ API Error: {e}") - print("This could be due to:") - print(" - Invalid API key") - print(" - Rate limiting") - print(" - Server issues") - - except Exception as e: - print(f"❌ Unexpected error: {e}") - print("This could be due to:") - print(" - Network connectivity issues") - print(" - Invalid website URL") - print(" - Pagination limitations") - - -def test_pagination_parameters(): - """Test different pagination parameters""" - - print("\n" + "=" * 50) - print("Testing different pagination parameters") - print("=" * 50) - - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return - - try: - Client(api_key=api_key) - except Exception as e: - print(f"❌ Error initializing client: {e}") - return - - # Test cases - test_cases = [ - { - "name": "Single page (default)", - "url": "https://example.com", - "total_pages": None, - }, - {"name": "Two pages", "url": "https://example.com/products", "total_pages": 2}, - { - "name": "Maximum pages", - "url": "https://example.com/search", - "total_pages": 10, - }, - ] - - for test_case in test_cases: - print(f"\n🧪 Test: {test_case['name']}") - print(f" Pages: {test_case['total_pages']}") - - try: - # This is just to demonstrate the API call structure - # In a real scenario, you'd use actual URLs - print(" ✅ Configuration valid") - - except Exception as e: - print(f" ❌ Configuration error: {e}") - - -def main(): - """Main function to run the pagination examples""" - - print("ScrapeGraph SDK - SmartScraper Pagination Examples") - print("=" * 60) - - # Run the main example - smartscraper_pagination_example() - - # Test different parameters - test_pagination_parameters() - - print("\n" + "=" * 60) - print("Examples completed!") - print("\nNext steps:") - print("1. Set SGAI_API_KEY environment variable") - print("2. 
Replace example URLs with real websites") - print("3. Adjust total_pages parameter (1-10)") - print("4. Customize user_prompt for your use case") - print("5. Define output_schema for structured data") - print("\nTips:") - print("- Use smaller total_pages for testing") - print("- Pagination requests may take longer") - print("- Some websites may not support pagination") - print("- Consider rate limiting for large requests") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/smartscraper/sync/smartscraper_render_heavy_example.py b/scrapegraph-py/examples/smartscraper/sync/smartscraper_render_heavy_example.py deleted file mode 100644 index 1a95699..0000000 --- a/scrapegraph-py/examples/smartscraper/sync/smartscraper_render_heavy_example.py +++ /dev/null @@ -1,35 +0,0 @@ -import os - -from dotenv import load_dotenv - -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -# Load environment variables from .env file -load_dotenv() - -sgai_logger.set_logging(level="INFO") - -# Initialize the client with API key from environment variable -api_key = os.getenv("SGAI_API_KEY") -if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: SGAI_API_KEY=your-api-key-here") - exit(1) - -sgai_client = Client(api_key=api_key) - -# SmartScraper request with render_heavy_js enabled -response = sgai_client.smartscraper( - website_url="https://example.com", - user_prompt="Find the CEO of company X and their contact details", - render_heavy_js=True, # Enable heavy JavaScript rendering -) - -# Print the response -print(f"Request ID: {response['request_id']}") -print(f"Result: {response['result']}") - -sgai_client.close() \ No newline at end of file diff --git a/scrapegraph-py/examples/smartscraper/sync/smartscraper_schema_example.py b/scrapegraph-py/examples/smartscraper/sync/smartscraper_schema_example.py deleted file mode 100644 index e2c7c04..0000000 --- a/scrapegraph-py/examples/smartscraper/sync/smartscraper_schema_example.py +++ /dev/null @@ -1,42 +0,0 @@ -import os - -from dotenv import load_dotenv -from pydantic import BaseModel, Field - -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -# Define a Pydantic model for the output schema -class WebpageSchema(BaseModel): - title: str = Field(description="The title of the webpage") - description: str = Field(description="The description of the webpage") - summary: str = Field(description="A brief summary of the webpage") - - -# Initialize the client with API key from environment variable -api_key = os.getenv("SGAI_API_KEY") -if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: SGAI_API_KEY=your-api-key-here") - exit(1) - -sgai_client = Client(api_key=api_key) - -# SmartScraper request with output schema -response = sgai_client.smartscraper( - website_url="https://example.com", - # website_html="...", # Optional, if you want to pass in HTML content instead of a URL - user_prompt="Extract webpage information", - output_schema=WebpageSchema, -) - -# Print the response -print(f"Request ID: {response['request_id']}") -print(f"Result: {response['result']}") - -sgai_client.close() diff --git a/scrapegraph-py/examples/stealth_mode_example.py b/scrapegraph-py/examples/stealth_mode_example.py deleted file mode 100644 index 442c3a3..0000000 --- a/scrapegraph-py/examples/stealth_mode_example.py +++ /dev/null @@ -1,494 +0,0 @@ -""" -Stealth Mode Examples for ScrapeGraph AI Python SDK - -This file demonstrates how to use stealth mode with various endpoints -to avoid bot detection when scraping websites. - -Stealth mode enables advanced techniques to make requests appear more -like those from a real browser, helping to bypass basic bot detection. -""" - -import os -from scrapegraph_py import Client -from pydantic import BaseModel, Field - -# Get API key from environment variable -API_KEY = os.getenv("SGAI_API_KEY", "your-api-key-here") - - -# ============================================================================ -# EXAMPLE 1: SmartScraper with Stealth Mode -# ============================================================================ - - -def example_smartscraper_with_stealth(): - """ - Extract structured data from a webpage using stealth mode. - Useful for websites with bot detection. 
- """ - print("\n" + "=" * 60) - print("EXAMPLE 1: SmartScraper with Stealth Mode") - print("=" * 60) - - with Client(api_key=API_KEY) as client: - try: - response = client.smartscraper( - website_url="https://www.scrapethissite.com/pages/simple/", - user_prompt="Extract country names and capitals", - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Request ID: {response['request_id']}") - print(f"Result: {response['result']}") - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# EXAMPLE 2: SmartScraper with Stealth Mode and Output Schema -# ============================================================================ - - -def example_smartscraper_with_stealth_and_schema(): - """ - Use stealth mode with a structured output schema to extract data - from websites that might detect bots. - """ - print("\n" + "=" * 60) - print("EXAMPLE 2: SmartScraper with Stealth Mode and Schema") - print("=" * 60) - - # Define output schema using Pydantic - class Product(BaseModel): - name: str = Field(description="Product name") - price: str = Field(description="Product price") - rating: float = Field(description="Product rating (0-5)") - - with Client(api_key=API_KEY) as client: - try: - response = client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information including name, price, and rating", - output_schema=Product, - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Request ID: {response['request_id']}") - print(f"Result: {response['result']}") - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# EXAMPLE 3: SearchScraper with Stealth Mode -# ============================================================================ - - -def example_searchscraper_with_stealth(): - """ - Search and 
extract information from multiple sources using stealth mode. - """ - print("\n" + "=" * 60) - print("EXAMPLE 3: SearchScraper with Stealth Mode") - print("=" * 60) - - with Client(api_key=API_KEY) as client: - try: - response = client.searchscraper( - user_prompt="What are the latest developments in AI technology?", - num_results=5, - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Request ID: {response['request_id']}") - print(f"Result: {response['result']}") - if "reference_urls" in response: - print(f"Reference URLs: {response['reference_urls']}") - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# EXAMPLE 4: Markdownify with Stealth Mode -# ============================================================================ - - -def example_markdownify_with_stealth(): - """ - Convert a webpage to markdown format using stealth mode. - """ - print("\n" + "=" * 60) - print("EXAMPLE 4: Markdownify with Stealth Mode") - print("=" * 60) - - with Client(api_key=API_KEY) as client: - try: - response = client.markdownify( - website_url="https://www.example.com", - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Request ID: {response['request_id']}") - print(f"Markdown Preview (first 500 chars):") - print(response["result"][:500]) - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# EXAMPLE 5: Scrape with Stealth Mode -# ============================================================================ - - -def example_scrape_with_stealth(): - """ - Get raw HTML from a webpage using stealth mode. 
- """ - print("\n" + "=" * 60) - print("EXAMPLE 5: Scrape with Stealth Mode") - print("=" * 60) - - with Client(api_key=API_KEY) as client: - try: - response = client.scrape( - website_url="https://www.example.com", - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Scrape Request ID: {response['scrape_request_id']}") - print(f"HTML Preview (first 500 chars):") - print(response["html"][:500]) - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# EXAMPLE 6: Scrape with Stealth Mode and Heavy JS Rendering -# ============================================================================ - - -def example_scrape_with_stealth_and_js(): - """ - Scrape a JavaScript-heavy website using stealth mode. - Combines JavaScript rendering with stealth techniques. - """ - print("\n" + "=" * 60) - print("EXAMPLE 6: Scrape with Stealth Mode and Heavy JS") - print("=" * 60) - - with Client(api_key=API_KEY) as client: - try: - response = client.scrape( - website_url="https://www.example.com", - render_heavy_js=True, # Enable JavaScript rendering - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Scrape Request ID: {response['scrape_request_id']}") - print(f"HTML Preview (first 500 chars):") - print(response["html"][:500]) - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# EXAMPLE 7: Agentic Scraper with Stealth Mode -# ============================================================================ - - -def example_agenticscraper_with_stealth(): - """ - Perform automated browser actions using stealth mode. - Ideal for interacting with protected forms or multi-step workflows. 
- """ - print("\n" + "=" * 60) - print("EXAMPLE 7: Agentic Scraper with Stealth Mode") - print("=" * 60) - - with Client(api_key=API_KEY) as client: - try: - response = client.agenticscraper( - url="https://dashboard.example.com/login", - steps=[ - "Type user@example.com in email input box", - "Type password123 in password input box", - "Click on login button", - ], - use_session=True, - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Request ID: {response['request_id']}") - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# EXAMPLE 8: Agentic Scraper with Stealth Mode and AI Extraction -# ============================================================================ - - -def example_agenticscraper_with_stealth_and_ai(): - """ - Combine stealth mode with AI extraction in agentic scraping. - Performs actions and then extracts structured data. - """ - print("\n" + "=" * 60) - print("EXAMPLE 8: Agentic Scraper with Stealth and AI Extraction") - print("=" * 60) - - with Client(api_key=API_KEY) as client: - try: - response = client.agenticscraper( - url="https://dashboard.example.com", - steps=[ - "Navigate to user profile section", - "Click on settings tab", - ], - use_session=True, - user_prompt="Extract user profile information and settings", - ai_extraction=True, - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Request ID: {response['request_id']}") - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# EXAMPLE 9: Crawl with Stealth Mode -# ============================================================================ - - -def example_crawl_with_stealth(): - """ - Crawl an entire website using stealth mode. - Useful for comprehensive data extraction from protected sites. 
- """ - print("\n" + "=" * 60) - print("EXAMPLE 9: Crawl with Stealth Mode") - print("=" * 60) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Website Content", - "type": "object", - "properties": { - "title": {"type": "string", "description": "Page title"}, - "content": {"type": "string", "description": "Main content"}, - }, - "required": ["title"], - } - - with Client(api_key=API_KEY) as client: - try: - response = client.crawl( - url="https://www.example.com", - prompt="Extract page titles and main content", - data_schema=schema, - depth=2, - max_pages=5, - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Crawl ID: {response['id']}") - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# EXAMPLE 10: Crawl with Stealth Mode and Sitemap -# ============================================================================ - - -def example_crawl_with_stealth_and_sitemap(): - """ - Use sitemap for efficient crawling with stealth mode enabled. 
- """ - print("\n" + "=" * 60) - print("EXAMPLE 10: Crawl with Stealth Mode and Sitemap") - print("=" * 60) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Product Information", - "type": "object", - "properties": { - "product_name": {"type": "string"}, - "price": {"type": "string"}, - "description": {"type": "string"}, - }, - "required": ["product_name"], - } - - with Client(api_key=API_KEY) as client: - try: - response = client.crawl( - url="https://www.example-shop.com", - prompt="Extract product information from all pages", - data_schema=schema, - sitemap=True, # Use sitemap for better page discovery - depth=3, - max_pages=10, - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Crawl ID: {response['id']}") - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# EXAMPLE 11: SmartScraper with Stealth, Custom Headers, and Pagination -# ============================================================================ - - -def example_smartscraper_advanced_stealth(): - """ - Advanced example combining stealth mode with custom headers and pagination. 
- """ - print("\n" + "=" * 60) - print("EXAMPLE 11: SmartScraper Advanced with Stealth") - print("=" * 60) - - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", - "Accept-Language": "en-US,en;q=0.9", - } - - with Client(api_key=API_KEY) as client: - try: - response = client.smartscraper( - website_url="https://www.example-marketplace.com/products", - user_prompt="Extract all product listings from multiple pages", - headers=headers, - number_of_scrolls=10, - total_pages=5, - render_heavy_js=True, - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Request ID: {response['request_id']}") - print(f"Result: {response['result']}") - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# EXAMPLE 12: Using Stealth Mode with Custom Headers -# ============================================================================ - - -def example_stealth_with_custom_headers(): - """ - Demonstrate using stealth mode together with custom headers - for maximum control over request appearance. - """ - print("\n" + "=" * 60) - print("EXAMPLE 12: Stealth Mode with Custom Headers") - print("=" * 60) - - # Custom headers to simulate a real browser request - headers = { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate, br", - "DNT": "1", - } - - with Client(api_key=API_KEY) as client: - try: - # Using with markdownify - response = client.markdownify( - website_url="https://www.protected-site.com", - headers=headers, - stealth=True, # Enable stealth mode - ) - - print(f"Status: {response['status']}") - print(f"Request ID: {response['request_id']}") - print(f"Success! 
Stealth mode + custom headers bypassed detection.") - - except Exception as e: - print(f"Error: {e}") - - -# ============================================================================ -# RUN ALL EXAMPLES -# ============================================================================ - - -def run_all_examples(): - """Run all stealth mode examples""" - print("\n") - print("=" * 60) - print("STEALTH MODE EXAMPLES FOR SCRAPEGRAPH AI PYTHON SDK") - print("=" * 60) - print("\nThese examples demonstrate how to use stealth mode") - print("to avoid bot detection when scraping websites.") - print("\nStealth mode is available for all major endpoints:") - print("- SmartScraper") - print("- SearchScraper") - print("- Markdownify") - print("- Scrape") - print("- Agentic Scraper") - print("- Crawl") - - examples = [ - example_smartscraper_with_stealth, - example_smartscraper_with_stealth_and_schema, - example_searchscraper_with_stealth, - example_markdownify_with_stealth, - example_scrape_with_stealth, - example_scrape_with_stealth_and_js, - example_agenticscraper_with_stealth, - example_agenticscraper_with_stealth_and_ai, - example_crawl_with_stealth, - example_crawl_with_stealth_and_sitemap, - example_smartscraper_advanced_stealth, - example_stealth_with_custom_headers, - ] - - for i, example_func in enumerate(examples, 1): - try: - example_func() - except Exception as e: - print(f"\nExample {i} failed: {e}") - - print("\n" + "=" * 60) - print("ALL EXAMPLES COMPLETED") - print("=" * 60) - - -if __name__ == "__main__": - # You can run all examples or specific ones - run_all_examples() - - # Or run individual examples: - # example_smartscraper_with_stealth() - # example_searchscraper_with_stealth() - # example_crawl_with_stealth() diff --git a/scrapegraph-py/examples/steps/step_by_step_schema_generation.py b/scrapegraph-py/examples/steps/step_by_step_schema_generation.py deleted file mode 100644 index ff91a75..0000000 --- 
a/scrapegraph-py/examples/steps/step_by_step_schema_generation.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python3 -""" -Step-by-step example for schema generation using ScrapeGraph Python SDK. - -This script demonstrates the basic workflow for schema generation: -1. Initialize the client -2. Generate a schema from a prompt -3. Check the status of the request -4. Retrieve the final result - -Requirements: -- Python 3.7+ -- scrapegraph-py package -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here - -Usage: - python step_by_step_schema_generation.py -""" - -import json -import os -import time -from typing import Any, Dict - -from dotenv import load_dotenv - -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -def print_step(step_number: int, title: str, description: str = ""): - """Print a formatted step header""" - print(f"\n{'='*60}") - print(f"STEP {step_number}: {title}") - print(f"{'='*60}") - if description: - print(description) - print() - - -def print_response(response: Dict[str, Any], title: str = "API Response"): - """Pretty print an API response""" - print(f"\n📋 {title}") - print("-" * 40) - - if "error" in response and response["error"]: - print(f"❌ Error: {response['error']}") - return - - for key, value in response.items(): - if key == "generated_schema" and value: - print(f"🔧 {key}:") - print(json.dumps(value, indent=2)) - else: - print(f"🔧 {key}: {value}") - - -def main(): - """Main function demonstrating step-by-step schema generation""" - - # Step 1: Check API key and initialize client - print_step(1, "Initialize Client", "Setting up the ScrapeGraph client with your API key") - - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY not found in .env file") - print("Please create a .env file with your API key:") - print("SGAI_API_KEY=your_api_key_here") - return - - try: - client = Client(api_key=api_key) - print("✅ Client 
initialized successfully") - except Exception as e: - print(f"❌ Failed to initialize client: {e}") - return - - # Step 2: Define the schema generation request - print_step(2, "Define Request", "Creating a prompt for schema generation") - - user_prompt = "Find laptops with specifications like brand, processor, RAM, storage, and price" - print(f"💭 User Prompt: {user_prompt}") - - # Step 3: Generate the schema - print_step(3, "Generate Schema", "Sending the schema generation request to the API") - - try: - response = client.generate_schema(user_prompt) - print("✅ Schema generation request sent successfully") - print_response(response, "Initial Response") - - # Extract the request ID for status checking - request_id = response.get('request_id') - if not request_id: - print("❌ No request ID returned from the API") - return - - except Exception as e: - print(f"❌ Failed to generate schema: {e}") - return - - # Step 4: Check the status (polling) - print_step(4, "Check Status", "Polling the API to check the status of the request") - - max_attempts = 10 - attempt = 0 - - while attempt < max_attempts: - attempt += 1 - print(f"🔍 Attempt {attempt}/{max_attempts}: Checking status...") - - try: - status_response = client.get_schema_status(request_id) - current_status = status_response.get('status', 'unknown') - - print(f"📊 Current Status: {current_status}") - - if current_status == 'completed': - print("✅ Schema generation completed successfully!") - print_response(status_response, "Final Result") - break - elif current_status == 'failed': - print("❌ Schema generation failed") - print_response(status_response, "Error Response") - break - elif current_status in ['pending', 'processing']: - print("⏳ Request is still being processed, waiting...") - if attempt < max_attempts: - time.sleep(2) # Wait 2 seconds before next check - else: - print(f"⚠️ Unknown status: {current_status}") - break - - except Exception as e: - print(f"❌ Error checking status: {e}") - break - - if attempt >= 
max_attempts: - print("⚠️ Maximum attempts reached. The request might still be processing.") - print("You can check the status later using the request ID.") - - # Step 5: Demonstrate schema modification - print_step(5, "Schema Modification", "Demonstrating how to modify an existing schema") - - existing_schema = { - "type": "object", - "properties": { - "name": {"type": "string"}, - "price": {"type": "number"}, - }, - "required": ["name", "price"], - } - - modification_prompt = "Add brand and rating fields to the existing schema" - print(f"💭 Modification Prompt: {modification_prompt}") - print(f"📋 Existing Schema: {json.dumps(existing_schema, indent=2)}") - - try: - modification_response = client.generate_schema(modification_prompt, existing_schema) - print("✅ Schema modification request sent successfully") - print_response(modification_response, "Modification Response") - - except Exception as e: - print(f"❌ Failed to modify schema: {e}") - - # Step 6: Cleanup - print_step(6, "Cleanup", "Closing the client to free up resources") - - try: - client.close() - print("✅ Client closed successfully") - except Exception as e: - print(f"⚠️ Warning: Error closing client: {e}") - - print("\n🎉 Schema generation demonstration completed!") - print(f"📝 Request ID for reference: {request_id}") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/toon_async_example.py b/scrapegraph-py/examples/toon_async_example.py deleted file mode 100644 index 2ffea9d..0000000 --- a/scrapegraph-py/examples/toon_async_example.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 -""" -Async example demonstrating TOON format integration with ScrapeGraph SDK. - -TOON (Token-Oriented Object Notation) reduces token usage by 30-60% compared to JSON, -which can significantly reduce costs when working with LLM APIs. - -This example shows how to use the `return_toon` parameter with various async scraping methods. 
-""" -import asyncio -import os -from scrapegraph_py import AsyncClient - - -async def main(): - """Demonstrate TOON format with different async scraping methods.""" - - # Set your API key as an environment variable - # export SGAI_API_KEY="your-api-key-here" - # or set it in your .env file - - # Initialize the async client - async with AsyncClient.from_env() as client: - print("🎨 Async TOON Format Integration Example\n") - print("=" * 60) - - # Example 1: SmartScraper with TOON format - print("\n📌 Example 1: Async SmartScraper with TOON Format") - print("-" * 60) - - try: - # Request with return_toon=False (default JSON response) - json_response = await client.smartscraper( - website_url="https://example.com", - user_prompt="Extract the page title and main heading", - return_toon=False - ) - - print("\nJSON Response:") - print(json_response) - - # Request with return_toon=True (TOON formatted response) - toon_response = await client.smartscraper( - website_url="https://example.com", - user_prompt="Extract the page title and main heading", - return_toon=True - ) - - print("\nTOON Response:") - print(toon_response) - - # Compare token sizes (approximate) - if isinstance(json_response, dict): - import json - json_str = json.dumps(json_response) - json_tokens = len(json_str.split()) - toon_tokens = len(str(toon_response).split()) - - savings = ((json_tokens - toon_tokens) / json_tokens) * 100 if json_tokens > 0 else 0 - - print(f"\n📊 Token Comparison:") - print(f" JSON tokens (approx): {json_tokens}") - print(f" TOON tokens (approx): {toon_tokens}") - print(f" Savings: {savings:.1f}%") - - except Exception as e: - print(f"Error in Example 1: {e}") - - # Example 2: SearchScraper with TOON format - print("\n\n📌 Example 2: Async SearchScraper with TOON Format") - print("-" * 60) - - try: - # Request with TOON format - toon_search_response = await client.searchscraper( - user_prompt="Latest AI developments in 2024", - num_results=3, - return_toon=True - ) - - 
print("\nTOON Search Response:") - print(toon_search_response) - - except Exception as e: - print(f"Error in Example 2: {e}") - - # Example 3: Markdownify with TOON format - print("\n\n📌 Example 3: Async Markdownify with TOON Format") - print("-" * 60) - - try: - # Request with TOON format - toon_markdown_response = await client.markdownify( - website_url="https://example.com", - return_toon=True - ) - - print("\nTOON Markdown Response:") - print(str(toon_markdown_response)[:500]) # Print first 500 chars - print("...(truncated)") - - except Exception as e: - print(f"Error in Example 3: {e}") - - print("\n\n✅ Async TOON Integration Examples Completed!") - print("=" * 60) - print("\n💡 Benefits of TOON Format:") - print(" • 30-60% reduction in token usage") - print(" • Lower LLM API costs") - print(" • Faster processing") - print(" • Human-readable format") - print("\n🔗 Learn more: https://github.com/ScrapeGraphAI/toonify") - - -if __name__ == "__main__": - asyncio.run(main()) - diff --git a/scrapegraph-py/examples/toon_example.py b/scrapegraph-py/examples/toon_example.py deleted file mode 100644 index e4e2921..0000000 --- a/scrapegraph-py/examples/toon_example.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 -""" -Example demonstrating TOON format integration with ScrapeGraph SDK. - -TOON (Token-Oriented Object Notation) reduces token usage by 30-60% compared to JSON, -which can significantly reduce costs when working with LLM APIs. - -This example shows how to use the `return_toon` parameter with various scraping methods. 
-""" -import os -from scrapegraph_py import Client - -# Set your API key as an environment variable -# export SGAI_API_KEY="your-api-key-here" -# or set it in your .env file - - -def main(): - """Demonstrate TOON format with different scraping methods.""" - - # Initialize the client - client = Client.from_env() - - print("🎨 TOON Format Integration Example\n") - print("=" * 60) - - # Example 1: SmartScraper with TOON format - print("\n📌 Example 1: SmartScraper with TOON Format") - print("-" * 60) - - try: - # Request with return_toon=False (default JSON response) - json_response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract the page title and main heading", - return_toon=False - ) - - print("\nJSON Response:") - print(json_response) - - # Request with return_toon=True (TOON formatted response) - toon_response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract the page title and main heading", - return_toon=True - ) - - print("\nTOON Response:") - print(toon_response) - - # Compare token sizes (approximate) - if isinstance(json_response, dict): - import json - json_str = json.dumps(json_response) - json_tokens = len(json_str.split()) - toon_tokens = len(str(toon_response).split()) - - savings = ((json_tokens - toon_tokens) / json_tokens) * 100 if json_tokens > 0 else 0 - - print(f"\n📊 Token Comparison:") - print(f" JSON tokens (approx): {json_tokens}") - print(f" TOON tokens (approx): {toon_tokens}") - print(f" Savings: {savings:.1f}%") - - except Exception as e: - print(f"Error in Example 1: {e}") - - # Example 2: SearchScraper with TOON format - print("\n\n📌 Example 2: SearchScraper with TOON Format") - print("-" * 60) - - try: - # Request with TOON format - toon_search_response = client.searchscraper( - user_prompt="Latest AI developments in 2024", - num_results=3, - return_toon=True - ) - - print("\nTOON Search Response:") - print(toon_search_response) - - except Exception as e: - print(f"Error 
in Example 2: {e}") - - # Example 3: Markdownify with TOON format - print("\n\n📌 Example 3: Markdownify with TOON Format") - print("-" * 60) - - try: - # Request with TOON format - toon_markdown_response = client.markdownify( - website_url="https://example.com", - return_toon=True - ) - - print("\nTOON Markdown Response:") - print(str(toon_markdown_response)[:500]) # Print first 500 chars - print("...(truncated)") - - except Exception as e: - print(f"Error in Example 3: {e}") - - print("\n\n✅ TOON Integration Examples Completed!") - print("=" * 60) - print("\n💡 Benefits of TOON Format:") - print(" • 30-60% reduction in token usage") - print(" • Lower LLM API costs") - print(" • Faster processing") - print(" • Human-readable format") - print("\n🔗 Learn more: https://github.com/ScrapeGraphAI/toonify") - - -if __name__ == "__main__": - main() - diff --git a/scrapegraph-py/examples/utilities/async_scrape_example.py b/scrapegraph-py/examples/utilities/async_scrape_example.py deleted file mode 100644 index 0a6c227..0000000 --- a/scrapegraph-py/examples/utilities/async_scrape_example.py +++ /dev/null @@ -1,278 +0,0 @@ -""" -Async example demonstrating how to use the Scrape API with the scrapegraph-py SDK. - -This example shows how to: -1. Set up the async client for Scrape -2. Make async API calls to get HTML content from websites -3. Handle responses and save HTML content -4. Demonstrate both regular and heavy JS rendering modes -5. 
Process multiple websites concurrently - -Requirements: -- Python 3.7+ -- scrapegraph-py -- python-dotenv -- aiofiles (for async file operations) -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import asyncio -import json -import os -import time -from pathlib import Path -from typing import Optional - -from dotenv import load_dotenv - -from scrapegraph_py import AsyncClient - -# Load environment variables from .env file -load_dotenv() - - -async def scrape_website( - client: AsyncClient, - website_url: str, - render_heavy_js: bool = False, - headers: Optional[dict[str, str]] = None, -) -> dict: - """ - Get HTML content from a website using the async Scrape API. - - Args: - client: The async scrapegraph-py client instance - website_url: The URL of the website to get HTML from - render_heavy_js: Whether to render heavy JavaScript (defaults to False) - headers: Optional headers to send with the request - - Returns: - dict: A dictionary containing the HTML content and metadata - - Raises: - Exception: If the API request fails - """ - js_mode = "with heavy JS rendering" if render_heavy_js else "without JS rendering" - print(f"Getting HTML content from: {website_url}") - print(f"Mode: {js_mode}") - - start_time = time.time() - - try: - result = await client.scrape( - website_url=website_url, - render_heavy_js=render_heavy_js, - headers=headers, - ) - execution_time = time.time() - start_time - print(f"Execution time: {execution_time:.2f} seconds") - return result - except Exception as e: - print(f"Error: {str(e)}") - raise - - -async def save_html_content( - html_content: str, filename: str, output_dir: str = "async_scrape_output" -): - """ - Save HTML content to a file asynchronously. 
- - Args: - html_content: The HTML content to save - filename: The name of the file (without extension) - output_dir: The directory to save the file in - """ - # Create output directory if it doesn't exist - output_path = Path(output_dir) - output_path.mkdir(exist_ok=True) - - # Save HTML file - html_file = output_path / f"{filename}.html" - - # Use asyncio to run file I/O in a thread pool - await asyncio.to_thread( - lambda: html_file.write_text(html_content, encoding="utf-8") - ) - - print(f"HTML content saved to: {html_file}") - return html_file - - -def analyze_html_content(html_content: str) -> dict: - """ - Analyze HTML content and provide basic statistics. - - Args: - html_content: The HTML content to analyze - - Returns: - dict: Basic statistics about the HTML content - """ - stats = { - "total_length": len(html_content), - "lines": len(html_content.splitlines()), - "has_doctype": html_content.strip().startswith(" dict: - """ - Process a single website and return results. - - Args: - client: The async client instance - website: Website configuration dictionary - - Returns: - dict: Processing results - """ - print(f"\nProcessing: {website['description']}") - print("-" * 40) - - try: - # Get HTML content - result = await scrape_website( - client=client, - website_url=website["url"], - render_heavy_js=website["render_heavy_js"], - ) - - # Display response metadata - print(f"Request ID: {result.get('scrape_request_id', 'N/A')}") - print(f"Status: {result.get('status', 'N/A')}") - print(f"Error: {result.get('error', 'None')}") - - # Analyze HTML content - html_content = result.get("html", "") - if html_content: - stats = analyze_html_content(html_content) - print(f"\nHTML Content Analysis:") - print(f" Total length: {stats['total_length']:,} characters") - print(f" Lines: {stats['lines']:,}") - print(f" Has DOCTYPE: {stats['has_doctype']}") - print(f" Has HTML tag: {stats['has_html_tag']}") - print(f" Has Head tag: {stats['has_head_tag']}") - print(f" Has Body 
tag: {stats['has_body_tag']}") - print(f" Script tags: {stats['script_tags']}") - print(f" Style tags: {stats['style_tags']}") - print(f" Div tags: {stats['div_tags']}") - print(f" Paragraph tags: {stats['p_tags']}") - print(f" Image tags: {stats['img_tags']}") - print(f" Link tags: {stats['link_tags']}") - - # Save HTML content - filename = f"{website['name']}_{'js' if website['render_heavy_js'] else 'nojs'}" - saved_file = await save_html_content(html_content, filename) - - # Show first 500 characters as preview - preview = html_content[:500].replace("\n", " ").strip() - print(f"\nHTML Preview (first 500 chars):") - print(f" {preview}...") - - return { - "success": True, - "website": website["url"], - "saved_file": str(saved_file), - "stats": stats, - "preview": preview - } - else: - print("No HTML content received") - return { - "success": False, - "website": website["url"], - "error": "No HTML content received" - } - - except Exception as e: - print(f"Error processing {website['url']}: {str(e)}") - return { - "success": False, - "website": website["url"], - "error": str(e) - } - - -async def main(): - """ - Main async function demonstrating Scrape API usage. 
- """ - # Example websites to test - test_websites = [ - { - "url": "https://example.com", - "name": "example", - "render_heavy_js": False, - "description": "Simple static website", - }, - { - "url": "https://httpbin.org/html", - "name": "httpbin_html", - "render_heavy_js": False, - "description": "HTTP testing service", - }, - ] - - print("Async Scrape API Example with scrapegraph-py SDK") - print("=" * 60) - - # Initialize the async client - try: - async with AsyncClient.from_env() as client: - print("✅ Async client initialized successfully") - - # Process websites concurrently - print(f"\n🚀 Processing {len(test_websites)} websites concurrently...") - - tasks = [ - process_website(client, website) - for website in test_websites - ] - - results = await asyncio.gather(*tasks, return_exceptions=True) - - # Display summary - print(f"\n📊 Processing Summary") - print("=" * 40) - - successful = 0 - for result in results: - if isinstance(result, Exception): - print(f"❌ Exception occurred: {result}") - elif result["success"]: - successful += 1 - print(f"✅ {result['website']}: {result['saved_file']}") - else: - print(f"❌ {result['website']}: {result.get('error', 'Unknown error')}") - - print(f"\n🎯 Results: {successful}/{len(test_websites)} websites processed successfully") - - except Exception as e: - print(f"❌ Failed to initialize async client: {str(e)}") - print("Make sure you have SGAI_API_KEY in your .env file") - return - - print("\n✅ Async processing completed") - - -if __name__ == "__main__": - # Run the async main function - asyncio.run(main()) diff --git a/scrapegraph-py/examples/utilities/get_credits_example.py b/scrapegraph-py/examples/utilities/get_credits_example.py deleted file mode 100644 index 6ef9e2f..0000000 --- a/scrapegraph-py/examples/utilities/get_credits_example.py +++ /dev/null @@ -1,13 +0,0 @@ -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_level("DEBUG") - -# Initialize the client -sgai_client = 
Client(api_key="your-api-key-here") - -# Check remaining credits -credits = sgai_client.get_credits() -print(f"Credits Info: {credits}") - -sgai_client.close() diff --git a/scrapegraph-py/examples/utilities/healthz_async_example.py b/scrapegraph-py/examples/utilities/healthz_async_example.py deleted file mode 100644 index b8f9242..0000000 --- a/scrapegraph-py/examples/utilities/healthz_async_example.py +++ /dev/null @@ -1,174 +0,0 @@ -""" -Health Check Example - Asynchronous - -This example demonstrates how to use the health check endpoint asynchronously -to monitor the ScrapeGraphAI API service status. This is particularly useful for: -- Async production monitoring and alerting -- Health checks in async web frameworks (FastAPI, Sanic, aiohttp) -- Concurrent health monitoring of multiple services -- Integration with async monitoring tools - -The health check endpoint (/healthz) provides a quick way to verify that -the API service is operational and ready to handle requests. -""" - -import asyncio -from scrapegraph_py import AsyncClient - - -async def main(): - """ - Demonstrates the async health check functionality with the ScrapeGraphAI API. - - The healthz endpoint returns status information about the service, - which can be used for monitoring and alerting purposes. 
- """ - # Initialize the async client from environment variables - # Ensure SGAI_API_KEY is set in your environment - async with AsyncClient.from_env() as client: - try: - print("🏥 Checking ScrapeGraphAI API health status (async)...") - print("-" * 50) - - # Perform health check - health_status = await client.healthz() - - # Display results - print("\n✅ Health Check Response:") - print(f"Status: {health_status.get('status', 'unknown')}") - - if 'message' in health_status: - print(f"Message: {health_status['message']}") - - # Additional fields that might be returned - for key, value in health_status.items(): - if key not in ['status', 'message']: - print(f"{key.capitalize()}: {value}") - - print("\n" + "-" * 50) - print("✨ Health check completed successfully!") - - # Example: Use in a monitoring context - if health_status.get('status') == 'healthy': - print("\n✓ Service is healthy and ready to accept requests") - else: - print("\n⚠️ Service may be experiencing issues") - - except Exception as e: - print(f"\n❌ Health check failed: {e}") - print("The service may be unavailable or experiencing issues") - - -async def monitoring_example(): - """ - Example of using health check in an async monitoring/alerting context. - - This function demonstrates how you might integrate the health check - into an async monitoring system or scheduled health check script. - """ - async with AsyncClient.from_env() as client: - try: - health_status = await client.healthz() - - # Simple health check logic - is_healthy = health_status.get('status') == 'healthy' - - if is_healthy: - print("✓ Health check passed") - return 0 # Success exit code - else: - print("✗ Health check failed") - return 1 # Failure exit code - - except Exception as e: - print(f"✗ Health check error: {e}") - return 2 # Error exit code - - -async def concurrent_health_checks(): - """ - Example of performing concurrent health checks. 
- - This demonstrates how you can efficiently check the health status - multiple times or monitor multiple aspects concurrently. - """ - async with AsyncClient.from_env() as client: - print("🏥 Performing concurrent health checks...") - - # Perform multiple health checks concurrently - results = await asyncio.gather( - client.healthz(), - client.healthz(), - client.healthz(), - return_exceptions=True - ) - - # Analyze results - successful_checks = sum( - 1 for r in results - if isinstance(r, dict) and r.get('status') == 'healthy' - ) - - print(f"\n✓ Successful health checks: {successful_checks}/{len(results)}") - - if successful_checks == len(results): - print("✓ All health checks passed - service is stable") - elif successful_checks > 0: - print("⚠️ Some health checks failed - service may be unstable") - else: - print("✗ All health checks failed - service is down") - - -async def fastapi_health_endpoint_example(): - """ - Example of how to integrate the health check into a FastAPI endpoint. - - This demonstrates a pattern for creating a health check endpoint - in your own FastAPI application that checks the ScrapeGraphAI API. 
- """ - # This is a demonstration of the pattern, not a runnable endpoint - print("\n📝 FastAPI Integration Pattern:") - print("-" * 50) - print(""" -from fastapi import FastAPI, HTTPException -from scrapegraph_py import AsyncClient - -app = FastAPI() - -@app.get("/health") -async def health_check(): - '''Health check endpoint that verifies ScrapeGraphAI API status''' - try: - async with AsyncClient.from_env() as client: - health = await client.healthz() - - if health.get('status') == 'healthy': - return { - "status": "healthy", - "scrape_graph_api": "operational" - } - else: - raise HTTPException( - status_code=503, - detail="ScrapeGraphAI API is unhealthy" - ) - except Exception as e: - raise HTTPException( - status_code=503, - detail=f"Health check failed: {str(e)}" - ) - """) - print("-" * 50) - - -if __name__ == "__main__": - # Run the main health check example - asyncio.run(main()) - - # Uncomment to run other examples - # exit_code = asyncio.run(monitoring_example()) - # exit(exit_code) - - # asyncio.run(concurrent_health_checks()) - # asyncio.run(fastapi_health_endpoint_example()) - diff --git a/scrapegraph-py/examples/utilities/healthz_example.py b/scrapegraph-py/examples/utilities/healthz_example.py deleted file mode 100644 index 3362a1e..0000000 --- a/scrapegraph-py/examples/utilities/healthz_example.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -Health Check Example - Synchronous - -This example demonstrates how to use the health check endpoint to monitor -the ScrapeGraphAI API service status. This is particularly useful for: -- Production monitoring and alerting -- Health checks in containerized environments (Kubernetes, Docker) -- Ensuring service availability before making API calls -- Integration with monitoring tools (Prometheus, Datadog, etc.) - -The health check endpoint (/healthz) provides a quick way to verify that -the API service is operational and ready to handle requests. 
-""" - -from scrapegraph_py import Client - -def main(): - """ - Demonstrates the health check functionality with the ScrapeGraphAI API. - - The healthz endpoint returns status information about the service, - which can be used for monitoring and alerting purposes. - """ - # Initialize the client from environment variables - # Ensure SGAI_API_KEY is set in your environment - client = Client.from_env() - - try: - print("🏥 Checking ScrapeGraphAI API health status...") - print("-" * 50) - - # Perform health check - health_status = client.healthz() - - # Display results - print("\n✅ Health Check Response:") - print(f"Status: {health_status.get('status', 'unknown')}") - - if 'message' in health_status: - print(f"Message: {health_status['message']}") - - # Additional fields that might be returned - for key, value in health_status.items(): - if key not in ['status', 'message']: - print(f"{key.capitalize()}: {value}") - - print("\n" + "-" * 50) - print("✨ Health check completed successfully!") - - # Example: Use in a monitoring context - if health_status.get('status') == 'healthy': - print("\n✓ Service is healthy and ready to accept requests") - else: - print("\n⚠️ Service may be experiencing issues") - - except Exception as e: - print(f"\n❌ Health check failed: {e}") - print("The service may be unavailable or experiencing issues") - - finally: - # Clean up - client.close() - - -def monitoring_example(): - """ - Example of using health check in a monitoring/alerting context. - - This function demonstrates how you might integrate the health check - into a monitoring system or scheduled health check script. 
- """ - client = Client.from_env() - - try: - health_status = client.healthz() - - # Simple health check logic - is_healthy = health_status.get('status') == 'healthy' - - if is_healthy: - print("✓ Health check passed") - return 0 # Success exit code - else: - print("✗ Health check failed") - return 1 # Failure exit code - - except Exception as e: - print(f"✗ Health check error: {e}") - return 2 # Error exit code - - finally: - client.close() - - -if __name__ == "__main__": - # Run the main health check example - main() - - # Uncomment to run monitoring example - # exit_code = monitoring_example() - # exit(exit_code) - diff --git a/scrapegraph-py/examples/utilities/optional_headers_example.py b/scrapegraph-py/examples/utilities/optional_headers_example.py deleted file mode 100644 index 7763f8f..0000000 --- a/scrapegraph-py/examples/utilities/optional_headers_example.py +++ /dev/null @@ -1,28 +0,0 @@ -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_logging(level="INFO") - -# Initialize the client with explicit API key -sgai_client = Client(api_key="your-api-key-here") - -# SmartScraper request -response = sgai_client.smartscraper( - website_url="https://example.com", - user_prompt="Extract the main heading, description, and summary of the webpage", - headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1", - }, -) - - -# Print the response -print(f"Request ID: {response['request_id']}") -print(f"Result: {response['result']}") - -sgai_client.close() diff --git a/scrapegraph-py/examples/utilities/scrape_direct_api_example.py b/scrapegraph-py/examples/utilities/scrape_direct_api_example.py deleted 
file mode 100644 index ba2c9a8..0000000 --- a/scrapegraph-py/examples/utilities/scrape_direct_api_example.py +++ /dev/null @@ -1,342 +0,0 @@ -""" -Direct API example showing how to use the Scrape API endpoint directly. - -This example demonstrates: -1. Direct API calls using requests library (equivalent to curl) -2. How to construct the API request manually -3. Comparison with the scrapegraph-py SDK -4. Error handling for direct API calls -5. The exact curl commands for each request - -Curl command examples: -# Basic scrape request -curl -X POST https://api.scrapegraphai.com/v1/scrape \ - -H "Content-Type: application/json" \ - -H "SGAI-APIKEY: sgai-e32215fb-5940-400f-91ea-30af5f35e0c9" \ - -d '{ - "website_url": "https://example.com", - "render_heavy_js": false - }' - -# With heavy JavaScript rendering -curl -X POST https://api.scrapegraphai.com/v1/scrape \ - -H "Content-Type: application/json" \ - -H "SGAI-APIKEY: sgai-e32215fb-5940-400f-91ea-30af5f35e0c9" \ - -d '{ - "website_url": "https://example.com", - "render_heavy_js": true - }' - -Requirements: -- Python 3.7+ -- requests -- scrapegraph-py -- python-dotenv -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import json -import time -from typing import Dict, Any, Optional - -import requests -from dotenv import load_dotenv -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -class DirectScrapeAPI: - """ - Direct API client for the Scrape endpoint (without using scrapegraph-py SDK). - This demonstrates how to make raw API calls equivalent to curl commands. - """ - - def __init__(self, api_key: str, base_url: str = "https://api.scrapegraphai.com/v1"): - """ - Initialize the direct API client. 
- - Args: - api_key: Your ScrapeGraph AI API key - base_url: Base URL for the API - """ - self.api_key = api_key - self.base_url = base_url - self.headers = { - "Content-Type": "application/json", - "SGAI-APIKEY": api_key - } - - def scrape( - self, - website_url: str, - render_heavy_js: bool = False, - headers: Optional[Dict[str, str]] = None - ) -> Dict[str, Any]: - """ - Make a direct scrape API request. - - Args: - website_url: The URL to scrape - render_heavy_js: Whether to render heavy JavaScript - headers: Optional headers to send with the scraping request - - Returns: - API response as dictionary - - Raises: - requests.RequestException: If the API request fails - """ - url = f"{self.base_url}/scrape" - - payload = { - "website_url": website_url, - "render_heavy_js": render_heavy_js - } - - if headers: - payload["headers"] = headers - - print(f"🌐 Making direct API request to: {url}") - print(f"📋 Payload: {json.dumps(payload, indent=2)}") - - try: - response = requests.post( - url, - json=payload, - headers=self.headers, - timeout=30 - ) - - print(f"📥 Response Status: {response.status_code}") - - # Handle different response status codes - if response.status_code == 200: - result = response.json() - print(f"✅ Request successful") - return result - elif response.status_code == 400: - error_data = response.json() - raise requests.RequestException(f"Bad Request: {error_data.get('error', 'Unknown error')}") - elif response.status_code == 401: - raise requests.RequestException("Unauthorized: Check your API key") - elif response.status_code == 429: - raise requests.RequestException("Rate limit exceeded") - elif response.status_code == 500: - raise requests.RequestException("Internal server error") - else: - raise requests.RequestException(f"Unexpected status code: {response.status_code}") - - except requests.Timeout: - raise requests.RequestException("Request timeout - API took too long to respond") - except requests.ConnectionError: - raise 
requests.RequestException("Connection error - unable to reach API") - except json.JSONDecodeError: - raise requests.RequestException("Invalid JSON response from API") - - -def demonstrate_curl_commands(): - """ - Display the equivalent curl commands for the API requests. - """ - print("🌐 EQUIVALENT CURL COMMANDS") - print("=" * 50) - - print("1️⃣ Basic scrape request (render_heavy_js=false):") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://example.com\",") - print(" \"render_heavy_js\": false") - print(" }'") - - print("\n2️⃣ Heavy JS rendering (render_heavy_js=true):") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://example.com\",") - print(" \"render_heavy_js\": true") - print(" }'") - - print("\n3️⃣ With custom headers:") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://example.com\",") - print(" \"render_heavy_js\": false,") - print(" \"headers\": {") - print(" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\",") - print(" \"Accept-Language\": \"en-US,en;q=0.9\",") - print(" \"Cookie\": \"session=abc123; preferences=dark_mode\"") - print(" }") - print(" }'") - - print("\n4️⃣ Real example with actual API key format:") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: sgai-e32215fb-5940-400f-91ea-30af5f35e0c9\" \\") - print(" -d '{") - print(" \"website_url\": \"https://example.com\",") - print(" 
\"render_heavy_js\": false") - print(" }'") - - -def compare_direct_vs_sdk(api_key: str, website_url: str): - """ - Compare direct API calls vs SDK usage. - - Args: - api_key: API key for authentication - website_url: URL to scrape - """ - print(f"\n🔄 COMPARISON: Direct API vs SDK") - print("=" * 40) - - # Test with direct API - print("\n1️⃣ Using Direct API (equivalent to curl):") - try: - direct_client = DirectScrapeAPI(api_key) - start_time = time.time() - direct_result = direct_client.scrape(website_url, render_heavy_js=False) - direct_time = time.time() - start_time - - direct_html = direct_result.get("html", "") - print(f"✅ Direct API completed in {direct_time:.2f}s") - print(f"📏 HTML length: {len(direct_html):,} characters") - print(f"📋 Response keys: {list(direct_result.keys())}") - - except Exception as e: - print(f"❌ Direct API failed: {str(e)}") - direct_result = None - direct_time = 0 - - # Test with SDK - print("\n2️⃣ Using scrapegraph-py SDK:") - try: - sdk_client = Client(api_key=api_key) - start_time = time.time() - sdk_result = sdk_client.scrape(website_url, render_heavy_js=False) - sdk_time = time.time() - start_time - - sdk_html = sdk_result.get("html", "") - print(f"✅ SDK completed in {sdk_time:.2f}s") - print(f"📏 HTML length: {len(sdk_html):,} characters") - print(f"📋 Response keys: {list(sdk_result.keys())}") - - sdk_client.close() - - except Exception as e: - print(f"❌ SDK failed: {str(e)}") - sdk_result = None - sdk_time = 0 - - # Compare results - if direct_result and sdk_result: - print(f"\n📊 Comparison Results:") - print(f" Time difference: {abs(direct_time - sdk_time):.2f}s") - print(f" HTML length difference: {abs(len(direct_html) - len(sdk_html)):,} chars") - print(f" Results identical: {direct_result == sdk_result}") - - print(f"\n💡 Conclusions:") - print(f" • Both methods produce identical results") - print(f" • SDK provides better error handling and validation") - print(f" • Direct API gives you full control over requests") - 
print(f" • Choose SDK for ease of use, direct API for custom integrations") - - -def demonstrate_error_handling(api_key: str): - """ - Demonstrate error handling for direct API calls. - - Args: - api_key: API key for authentication - """ - print(f"\n🚨 ERROR HANDLING DEMONSTRATION") - print("=" * 40) - - direct_client = DirectScrapeAPI(api_key) - - # Test cases for different errors - error_tests = [ - { - "name": "Invalid URL", - "url": "not-a-valid-url", - "expected": "ValidationError" - }, - { - "name": "Empty URL", - "url": "", - "expected": "ValidationError" - }, - { - "name": "Non-existent domain", - "url": "https://this-domain-definitely-does-not-exist-12345.com", - "expected": "Connection/Timeout Error" - } - ] - - for test in error_tests: - print(f"\n🧪 Testing: {test['name']}") - print(f" URL: {test['url']}") - print(f" Expected: {test['expected']}") - - try: - result = direct_client.scrape(test["url"]) - print(f" ⚠️ Unexpected success: {result.get('status', 'Unknown')}") - except Exception as e: - print(f" ✅ Expected error caught: {str(e)}") - - -def main(): - """ - Main function demonstrating direct API usage. 
- """ - print("🚀 Scrape API: Direct API Usage Example") - print("=" * 50) - - # Show curl command equivalents - demonstrate_curl_commands() - - # Get API key from environment - import os - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("\n❌ Error: SGAI_API_KEY not found in environment variables") - print("Please add your API key to your .env file:") - print("SGAI_API_KEY=your-api-key-here") - return - - print(f"\n✅ API key loaded from environment") - - # Test website - test_url = "https://example.com" - - # Compare direct API vs SDK - compare_direct_vs_sdk(api_key, test_url) - - # Demonstrate error handling - demonstrate_error_handling(api_key) - - print(f"\n🎯 SUMMARY") - print("=" * 20) - print("✅ Direct API calls work identically to curl commands") - print("✅ SDK provides additional convenience and error handling") - print("✅ Both approaches produce the same results") - print("✅ Choose based on your integration needs") - - print(f"\n📚 Next Steps:") - print("• Try the curl commands in your terminal") - print("• Experiment with different render_heavy_js settings") - print("• Test with your own websites") - print("• Consider using the SDK for production applications") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/utilities/scrape_example.py b/scrapegraph-py/examples/utilities/scrape_example.py deleted file mode 100644 index 552d79f..0000000 --- a/scrapegraph-py/examples/utilities/scrape_example.py +++ /dev/null @@ -1,217 +0,0 @@ -""" -Example demonstrating how to use the Scrape API with the scrapegraph-py SDK. - -This example shows how to: -1. Set up the client for Scrape -2. Make the API call to get HTML content from a website -3. Handle the response and save the HTML content -4. Demonstrate both regular and heavy JS rendering modes -5. 
Display the results and metadata - -Requirements: -- Python 3.7+ -- scrapegraph-py -- python-dotenv -- A .env file with your SGAI_API_KEY - -Example .env file: -SGAI_API_KEY=your_api_key_here -""" - -import json -import os -import time -from pathlib import Path -from typing import Optional - -from dotenv import load_dotenv - -from scrapegraph_py import Client - -# Load environment variables from .env file -load_dotenv() - - -def scrape_website( - client: Client, - website_url: str, - render_heavy_js: bool = False, - headers: Optional[dict[str, str]] = None, -) -> dict: - """ - Get HTML content from a website using the Scrape API. - - Args: - client: The scrapegraph-py client instance - website_url: The URL of the website to get HTML from - render_heavy_js: Whether to render heavy JavaScript (defaults to False) - headers: Optional headers to send with the request - - Returns: - dict: A dictionary containing the HTML content and metadata - - Raises: - Exception: If the API request fails - """ - js_mode = "with heavy JS rendering" if render_heavy_js else "without JS rendering" - print(f"Getting HTML content from: {website_url}") - print(f"Mode: {js_mode}") - - start_time = time.time() - - try: - result = client.scrape( - website_url=website_url, - render_heavy_js=render_heavy_js, - headers=headers, - ) - execution_time = time.time() - start_time - print(f"Execution time: {execution_time:.2f} seconds") - return result - except Exception as e: - print(f"Error: {str(e)}") - raise - - -def save_html_content( - html_content: str, filename: str, output_dir: str = "scrape_output" -): - """ - Save HTML content to a file. 
- - Args: - html_content: The HTML content to save - filename: The name of the file (without extension) - output_dir: The directory to save the file in - """ - # Create output directory if it doesn't exist - output_path = Path(output_dir) - output_path.mkdir(exist_ok=True) - - # Save HTML file - html_file = output_path / f"{filename}.html" - with open(html_file, "w", encoding="utf-8") as f: - f.write(html_content) - - print(f"HTML content saved to: {html_file}") - return html_file - - -def analyze_html_content(html_content: str) -> dict: - """ - Analyze HTML content and provide basic statistics. - - Args: - html_content: The HTML content to analyze - - Returns: - dict: Basic statistics about the HTML content - """ - stats = { - "total_length": len(html_content), - "lines": len(html_content.splitlines()), - "has_doctype": html_content.strip().startswith(" Dict[str, Any]: - """ - Compare scraping results with and without heavy JS rendering. - - Args: - client: The scrapegraph-py client instance - website_url: The URL to scrape - headers: Optional headers to send with the request - - Returns: - Dict containing comparison results - """ - print(f"🌐 Scraping {website_url} with comparison...") - print("=" * 60) - - results = {} - - # Test without heavy JS rendering (default) - print("\n1️⃣ Scraping WITHOUT heavy JS rendering...") - start_time = time.time() - - try: - result_no_js = client.scrape( - website_url=website_url, - render_heavy_js=False, - headers=headers - ) - no_js_time = time.time() - start_time - - html_no_js = result_no_js.get("html", "") - results["no_js"] = { - "success": True, - "html_length": len(html_no_js), - "execution_time": no_js_time, - "html_content": html_no_js, - "result": result_no_js - } - - print(f"✅ Completed in {no_js_time:.2f} seconds") - print(f"📏 HTML length: {len(html_no_js):,} characters") - - except Exception as e: - results["no_js"] = { - "success": False, - "error": str(e), - "execution_time": time.time() - start_time - } - 
print(f"❌ Failed: {str(e)}") - - # Test with heavy JS rendering - print("\n2️⃣ Scraping WITH heavy JS rendering...") - start_time = time.time() - - try: - result_with_js = client.scrape( - website_url=website_url, - render_heavy_js=True, - headers=headers - ) - with_js_time = time.time() - start_time - - html_with_js = result_with_js.get("html", "") - results["with_js"] = { - "success": True, - "html_length": len(html_with_js), - "execution_time": with_js_time, - "html_content": html_with_js, - "result": result_with_js - } - - print(f"✅ Completed in {with_js_time:.2f} seconds") - print(f"📏 HTML length: {len(html_with_js):,} characters") - - except Exception as e: - results["with_js"] = { - "success": False, - "error": str(e), - "execution_time": time.time() - start_time - } - print(f"❌ Failed: {str(e)}") - - return results - - -def analyze_differences(results: Dict[str, Any]) -> Dict[str, Any]: - """ - Analyze the differences between JS and non-JS rendering results. - - Args: - results: Results from scrape_with_comparison - - Returns: - Analysis results - """ - print("\n🔍 ANALYSIS: Comparing Results") - print("=" * 40) - - analysis = {} - - if results["no_js"]["success"] and results["with_js"]["success"]: - no_js_html = results["no_js"]["html_content"] - with_js_html = results["with_js"]["html_content"] - - # Length comparison - length_diff = results["with_js"]["html_length"] - results["no_js"]["html_length"] - length_percent = (length_diff / results["no_js"]["html_length"]) * 100 if results["no_js"]["html_length"] > 0 else 0 - - # Time comparison - time_diff = results["with_js"]["execution_time"] - results["no_js"]["execution_time"] - time_percent = (time_diff / results["no_js"]["execution_time"]) * 100 if results["no_js"]["execution_time"] > 0 else 0 - - # Content analysis - no_js_scripts = no_js_html.lower().count(" 1000: - print(" ✅ Heavy JS rendering captured significantly more content") - print(" ✅ Use render_heavy_js=True for this website") - elif 
length_diff > 0: - print(" ⚠️ Heavy JS rendering captured some additional content") - print(" ⚠️ Consider using render_heavy_js=True if you need dynamic content") - else: - print(" ℹ️ No significant difference in content") - print(" ℹ️ render_heavy_js=False is sufficient for this website") - - if time_diff > 5: - print(" ⚠️ Heavy JS rendering is significantly slower") - print(" ⚠️ Consider cost vs. benefit for your use case") - - else: - print("❌ Cannot compare - one or both requests failed") - if not results["no_js"]["success"]: - print(f" No JS error: {results['no_js'].get('error', 'Unknown')}") - if not results["with_js"]["success"]: - print(f" With JS error: {results['with_js'].get('error', 'Unknown')}") - - return analysis - - -def save_comparison_results(results: Dict[str, Any], analysis: Dict[str, Any], website_url: str): - """ - Save the comparison results to files. - - Args: - results: Scraping results - analysis: Analysis results - website_url: The scraped website URL - """ - print(f"\n💾 Saving comparison results...") - - # Create output directory - output_dir = Path("render_heavy_js_comparison") - output_dir.mkdir(exist_ok=True) - - # Save HTML files - if results["no_js"]["success"]: - no_js_file = output_dir / "scrape_no_js.html" - with open(no_js_file, "w", encoding="utf-8") as f: - f.write(results["no_js"]["html_content"]) - print(f"📄 No JS HTML saved to: {no_js_file}") - - if results["with_js"]["success"]: - with_js_file = output_dir / "scrape_with_js.html" - with open(with_js_file, "w", encoding="utf-8") as f: - f.write(results["with_js"]["html_content"]) - print(f"📄 With JS HTML saved to: {with_js_file}") - - # Save analysis report - report = { - "website_url": website_url, - "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), - "results_summary": { - "no_js_success": results["no_js"]["success"], - "with_js_success": results["with_js"]["success"], - "no_js_html_length": results["no_js"].get("html_length", 0), - "with_js_html_length": 
results["with_js"].get("html_length", 0), - "no_js_time": results["no_js"].get("execution_time", 0), - "with_js_time": results["with_js"].get("execution_time", 0), - }, - "analysis": analysis - } - - report_file = output_dir / "comparison_report.json" - with open(report_file, "w", encoding="utf-8") as f: - json.dump(report, f, indent=2) - print(f"📊 Analysis report saved to: {report_file}") - - -def demonstrate_curl_equivalent(): - """ - Show the curl command equivalent for the scrape API calls. - """ - print(f"\n🌐 CURL COMMAND EQUIVALENTS") - print("=" * 50) - - print("1️⃣ Scrape WITHOUT heavy JS rendering:") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://example.com\",") - print(" \"render_heavy_js\": false") - print(" }'") - - print("\n2️⃣ Scrape WITH heavy JS rendering:") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://example.com\",") - print(" \"render_heavy_js\": true") - print(" }'") - - print("\n3️⃣ With custom headers:") - print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\") - print(" -H \"Content-Type: application/json\" \\") - print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") - print(" -d '{") - print(" \"website_url\": \"https://example.com\",") - print(" \"render_heavy_js\": true,") - print(" \"headers\": {") - print(" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\",") - print(" \"Accept-Language\": \"en-US,en;q=0.9\"") - print(" }") - print(" }'") - - -def main(): - """ - Main function demonstrating render_heavy_js functionality. 
- """ - print("🚀 Scrape API: render_heavy_js Comparison Example") - print("=" * 60) - - # Test websites - mix of static and dynamic content - test_websites = [ - { - "url": "https://example.com", - "name": "Example.com (Static)", - "description": "Simple static website - minimal JS" - }, - { - "url": "https://httpbin.org/html", - "name": "HTTPBin HTML", - "description": "HTTP testing service - static HTML" - } - ] - - # Show curl equivalents first - demonstrate_curl_equivalent() - - # Initialize client - try: - client = Client.from_env() - print(f"\n✅ Client initialized successfully") - except Exception as e: - print(f"❌ Failed to initialize client: {str(e)}") - print("Make sure you have SGAI_API_KEY in your .env file") - return - - # Test each website - for website in test_websites: - print(f"\n{'='*80}") - print(f"🧪 TESTING: {website['name']}") - print(f"📝 Description: {website['description']}") - print(f"🔗 URL: {website['url']}") - print(f"{'='*80}") - - try: - # Perform comparison - results = scrape_with_comparison(client, website["url"]) - - # Analyze differences - analysis = analyze_differences(results) - - # Save results - save_comparison_results(results, analysis, website["url"]) - - except Exception as e: - print(f"❌ Error testing {website['url']}: {str(e)}") - - # Close client - client.close() - print(f"\n🔒 Client closed successfully") - - # Final recommendations - print(f"\n💡 GENERAL RECOMMENDATIONS") - print("=" * 30) - print("🔹 Use render_heavy_js=False (default) for:") - print(" • Static websites") - print(" • Simple content sites") - print(" • When speed is priority") - print(" • When cost optimization is important") - - print("\n🔹 Use render_heavy_js=True for:") - print(" • Single Page Applications (SPAs)") - print(" • React/Vue/Angular websites") - print(" • Sites with dynamic content loading") - print(" • When you need JavaScript-rendered content") - - print("\n🔹 Cost considerations:") - print(" • render_heavy_js=True takes longer and uses more 
resources") - print(" • Test both options to determine if the extra content is worth it") - print(" • Consider caching results for frequently accessed pages") - - -if __name__ == "__main__": - main() diff --git a/scrapegraph-py/examples/utilities/send_feedback_example.py b/scrapegraph-py/examples/utilities/send_feedback_example.py deleted file mode 100644 index 4c397ed..0000000 --- a/scrapegraph-py/examples/utilities/send_feedback_example.py +++ /dev/null @@ -1,28 +0,0 @@ -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_logging(level="INFO") - -# Initialize the client -sgai_client = Client(api_key="your-api-key-here") - -# Example request_id (replace with an actual request_id from a previous request) -request_id = "your-request-id-here" - -# Check remaining credits -credits = sgai_client.get_credits() -print(f"Credits Info: {credits}") - -# Submit feedback for a previous request -feedback_response = sgai_client.submit_feedback( - request_id=request_id, - rating=5, # Rating from 1-5 - feedback_text="The extraction was accurate and exactly what I needed!", -) -print(f"\nFeedback Response: {feedback_response}") - -# Get previous results using get_smartscraper -previous_result = sgai_client.get_smartscraper(request_id=request_id) -print(f"\nRetrieved Previous Result: {previous_result}") - -sgai_client.close() diff --git a/scrapegraph-py/pyproject.toml b/scrapegraph-py/pyproject.toml index 95ec2db..3ef69ed 100644 --- a/scrapegraph-py/pyproject.toml +++ b/scrapegraph-py/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraph_py" -version = "1.12.2" +version = "2.0.0" description = "ScrapeGraph Python SDK for API" authors = [ { name = "Marco Vinciguerra", email = "marco@scrapegraphai.com" }, diff --git a/scrapegraph-py/scrapegraph_py/__init__.py b/scrapegraph-py/scrapegraph_py/__init__.py index 588effe..a34b34d 100644 --- a/scrapegraph-py/scrapegraph_py/__init__.py +++ b/scrapegraph-py/scrapegraph_py/__init__.py @@ 
-1,97 +1,59 @@ -""" -ScrapeGraphAI Python SDK - -A comprehensive Python SDK for the ScrapeGraphAI API, providing both synchronous -and asynchronous clients for all API endpoints. - -Main Features: - - SmartScraper: AI-powered web scraping with structured data extraction - - SearchScraper: Web research across multiple sources - - Agentic Scraper: Automated browser interactions and form filling - - Crawl: Website crawling with AI extraction or markdown conversion - - Markdownify: Convert web pages to clean markdown - - Schema Generation: AI-assisted schema creation for data extraction - - Scheduled Jobs: Automate recurring scraping tasks - -Quick Start: - >>> from scrapegraph_py import Client - >>> - >>> # Initialize client from environment variables - >>> client = Client.from_env() - >>> - >>> # Basic scraping - >>> result = client.smartscraper( - ... website_url="https://example.com", - ... user_prompt="Extract all product information" - ... ) - >>> - >>> # With context manager - >>> with Client.from_env() as client: - ... result = client.scrape(website_url="https://example.com") - -Async Usage: - >>> import asyncio - >>> from scrapegraph_py import AsyncClient - >>> - >>> async def main(): - ... async with AsyncClient.from_env() as client: - ... result = await client.smartscraper( - ... website_url="https://example.com", - ... user_prompt="Extract products" - ... 
) - >>> - >>> asyncio.run(main()) - -For more information visit: https://scrapegraphai.com -Documentation: https://docs.scrapegraphai.com -""" - -from .async_client import AsyncClient -from .client import Client - -# Scrape Models -from .models.scrape import ( - ScrapeRequest, - GetScrapeRequest, -) - -# Scheduled Jobs Models -from .models.scheduled_jobs import ( - GetJobExecutionsRequest, - GetScheduledJobRequest, - GetScheduledJobsRequest, - JobActionRequest, - JobActionResponse, - JobExecutionListResponse, - JobExecutionResponse, - JobTriggerResponse, - ScheduledJobCreate, - ScheduledJobListResponse, - ScheduledJobResponse, - ScheduledJobUpdate, - ServiceType, - TriggerJobRequest, -) - -__all__ = [ - "Client", - "AsyncClient", - # Scrape Models - "ScrapeRequest", - "GetScrapeRequest", - # Scheduled Jobs Models - "ServiceType", - "ScheduledJobCreate", - "ScheduledJobUpdate", - "ScheduledJobResponse", - "ScheduledJobListResponse", - "JobExecutionResponse", - "JobExecutionListResponse", - "JobTriggerResponse", - "JobActionResponse", - "GetScheduledJobsRequest", - "GetScheduledJobRequest", - "GetJobExecutionsRequest", - "TriggerJobRequest", - "JobActionRequest", -] +""" +ScrapeGraphAI Python SDK v2 + +A Python SDK for the ScrapeGraphAI v2 API, providing both synchronous +and asynchronous clients for intelligent web scraping powered by AI. + +Quick Start: + >>> from scrapegraph_py import Client + >>> client = Client(api_key="sgai-...") + >>> result = client.scrape("https://example.com") + >>> result = client.extract("https://example.com", prompt="Extract prices") + >>> job = client.crawl.start("https://example.com", depth=3) + +Async Usage: + >>> import asyncio + >>> from scrapegraph_py import AsyncClient + >>> async def main(): + ... async with AsyncClient(api_key="sgai-...") as client: + ... result = await client.extract( + ... url="https://example.com", + ... prompt="Extract products" + ... 
) + >>> asyncio.run(main()) +""" + +from .async_client import AsyncClient +from .client import Client +from .config import VERSION +from .models.crawl import CrawlFormat, CrawlRequest +from .models.extract import ExtractRequest +from .models.history import HistoryFilter +from .models.monitor import MonitorCreateRequest +from .models.scrape import ScrapeFormat, ScrapeRequest +from .models.search import SearchRequest +from .models.shared import FetchConfig, LlmConfig + +__version__ = VERSION + +__all__ = [ + "Client", + "AsyncClient", + # Shared config + "FetchConfig", + "LlmConfig", + # Scrape + "ScrapeFormat", + "ScrapeRequest", + # Extract + "ExtractRequest", + # Search + "SearchRequest", + # Crawl + "CrawlFormat", + "CrawlRequest", + # Monitor + "MonitorCreateRequest", + # History + "HistoryFilter", +] diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index 5111849..c3279aa 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -1,111 +1,167 @@ """ -Asynchronous HTTP client for the ScrapeGraphAI API. - -This module provides an asynchronous client for interacting with all ScrapeGraphAI -API endpoints including smartscraper, searchscraper, crawl, agentic scraper, -markdownify, schema generation, scheduled jobs, and utility functions. - -The AsyncClient class supports: -- API key authentication -- SSL verification configuration -- Request timeout configuration -- Automatic retry logic with exponential backoff -- Mock mode for testing -- Async context manager support for proper resource cleanup -- Concurrent requests using asyncio +Asynchronous HTTP client for the ScrapeGraphAI v2 API. Example: - Basic usage with environment variables: - >>> import asyncio - >>> from scrapegraph_py import AsyncClient - >>> async def main(): - ... client = AsyncClient.from_env() - ... result = await client.smartscraper( - ... website_url="https://example.com", - ... 
user_prompt="Extract product information" - ... ) - ... await client.close() - >>> asyncio.run(main()) - - Using async context manager: - >>> async def main(): - ... async with AsyncClient(api_key="sgai-...") as client: - ... result = await client.scrape(website_url="https://example.com") - >>> asyncio.run(main()) + >>> import asyncio + >>> from scrapegraph_py import AsyncClient + >>> async def main(): + ... async with AsyncClient(api_key="sgai-...") as client: + ... result = await client.extract( + ... url="https://example.com", + ... prompt="Extract product information" + ... ) + ... print(result) + >>> asyncio.run(main()) """ + import asyncio -from typing import Any, Dict, Optional, Callable +from typing import Any, Dict, List, Optional from aiohttp import ClientSession, ClientTimeout, TCPConnector from aiohttp.client_exceptions import ClientError from pydantic import BaseModel -from urllib.parse import urlparse -import uuid as _uuid from scrapegraph_py.config import API_BASE_URL, DEFAULT_HEADERS from scrapegraph_py.exceptions import APIError from scrapegraph_py.logger import sgai_logger as logger -from scrapegraph_py.models.agenticscraper import ( - AgenticScraperRequest, - GetAgenticScraperRequest, -) -from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest -from scrapegraph_py.models.feedback import FeedbackRequest -from scrapegraph_py.models.scrape import GetScrapeRequest, ScrapeRequest -from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest -from scrapegraph_py.models.schema import ( - GenerateSchemaRequest, - GetSchemaStatusRequest, - SchemaGenerationResponse, -) -from scrapegraph_py.models.searchscraper import ( - GetSearchScraperRequest, - SearchScraperRequest, - TimeRange, -) -from scrapegraph_py.models.sitemap import SitemapRequest, SitemapResponse -from scrapegraph_py.models.smartscraper import ( - GetSmartScraperRequest, - SmartScraperRequest, -) -from scrapegraph_py.models.scheduled_jobs import ( - 
GetJobExecutionsRequest, - GetScheduledJobRequest, - GetScheduledJobsRequest, - JobActionRequest, - ScheduledJobCreate, - ScheduledJobUpdate, - TriggerJobRequest, -) +from scrapegraph_py.models.crawl import CrawlFormat, CrawlRequest +from scrapegraph_py.models.extract import ExtractRequest +from scrapegraph_py.models.history import HistoryFilter +from scrapegraph_py.models.monitor import MonitorCreateRequest +from scrapegraph_py.models.scrape import ScrapeFormat, ScrapeRequest +from scrapegraph_py.models.search import SearchRequest +from scrapegraph_py.models.shared import FetchConfig, LlmConfig from scrapegraph_py.utils.helpers import handle_async_response, validate_api_key -from scrapegraph_py.utils.toon_converter import process_response_with_toon -class AsyncClient: - """ - Asynchronous client for the ScrapeGraphAI API. +class _AsyncCrawlNamespace: + """Namespaced async crawl operations.""" + + def __init__(self, client: "AsyncClient"): + self._client = client + + async def start( + self, + url: str, + depth: int = 2, + max_pages: int = 10, + format: str = "markdown", + include_patterns: Optional[List[str]] = None, + exclude_patterns: Optional[List[str]] = None, + fetch_config: Optional[FetchConfig] = None, + ) -> Dict[str, Any]: + """Start a crawl job.""" + logger.info(f"Starting crawl for {url}") + request = CrawlRequest( + url=url, + depth=depth, + max_pages=max_pages, + format=CrawlFormat(format), + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + fetch_config=fetch_config, + ) + return await self._client._make_request( + "POST", f"{self._client.base_url}/crawl", json=request.model_dump() + ) + + async def status(self, crawl_id: str) -> Dict[str, Any]: + """Get crawl job status and results.""" + logger.info(f"Fetching crawl status for {crawl_id}") + return await self._client._make_request( + "GET", f"{self._client.base_url}/crawl/{crawl_id}" + ) - This class provides asynchronous methods for all ScrapeGraphAI API endpoints. 
- It handles authentication, request management, error handling, and supports - mock mode for testing. Uses aiohttp for efficient async HTTP requests. + async def stop(self, crawl_id: str) -> Dict[str, Any]: + """Stop a running crawl job.""" + logger.info(f"Stopping crawl {crawl_id}") + return await self._client._make_request( + "POST", f"{self._client.base_url}/crawl/{crawl_id}/stop" + ) + + async def resume(self, crawl_id: str) -> Dict[str, Any]: + """Resume a stopped crawl job.""" + logger.info(f"Resuming crawl {crawl_id}") + return await self._client._make_request( + "POST", f"{self._client.base_url}/crawl/{crawl_id}/resume" + ) + + +class _AsyncMonitorNamespace: + """Namespaced async monitor operations.""" + + def __init__(self, client: "AsyncClient"): + self._client = client + + async def create( + self, + name: str, + url: str, + prompt: str, + cron: str, + output_schema: Optional[Dict[str, Any]] = None, + fetch_config: Optional[FetchConfig] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Create a new monitor.""" + logger.info(f"Creating monitor '{name}' for {url}") + request = MonitorCreateRequest( + name=name, + url=url, + prompt=prompt, + cron=cron, + output_schema=output_schema, + fetch_config=fetch_config, + llm_config=llm_config, + ) + return await self._client._make_request( + "POST", f"{self._client.base_url}/monitor", json=request.model_dump() + ) - Attributes: - api_key (str): The API key for authentication - headers (dict): Default headers including API key - timeout (ClientTimeout): Request timeout configuration - max_retries (int): Maximum number of retry attempts - retry_delay (float): Base delay between retries in seconds - mock (bool): Whether mock mode is enabled - session (ClientSession): Aiohttp session for connection pooling + async def list(self) -> Dict[str, Any]: + """List all monitors.""" + logger.info("Listing monitors") + return await self._client._make_request("GET", f"{self._client.base_url}/monitor") 
+ + async def get(self, monitor_id: str) -> Dict[str, Any]: + """Get a specific monitor.""" + logger.info(f"Fetching monitor {monitor_id}") + return await self._client._make_request( + "GET", f"{self._client.base_url}/monitor/{monitor_id}" + ) + + async def pause(self, monitor_id: str) -> Dict[str, Any]: + """Pause a monitor.""" + logger.info(f"Pausing monitor {monitor_id}") + return await self._client._make_request( + "POST", f"{self._client.base_url}/monitor/{monitor_id}/pause" + ) + + async def resume(self, monitor_id: str) -> Dict[str, Any]: + """Resume a paused monitor.""" + logger.info(f"Resuming monitor {monitor_id}") + return await self._client._make_request( + "POST", f"{self._client.base_url}/monitor/{monitor_id}/resume" + ) + + async def delete(self, monitor_id: str) -> Dict[str, Any]: + """Delete a monitor.""" + logger.info(f"Deleting monitor {monitor_id}") + return await self._client._make_request( + "DELETE", f"{self._client.base_url}/monitor/{monitor_id}" + ) + + +class AsyncClient: + """Asynchronous client for the ScrapeGraphAI v2 API. Example: - >>> async def example(): - ... async with AsyncClient.from_env() as client: - ... result = await client.smartscraper( - ... website_url="https://example.com", - ... user_prompt="Extract all products" - ... ) + >>> async with AsyncClient(api_key="sgai-...") as client: + ... result = await client.scrape("https://example.com") + ... result = await client.extract("https://example.com", prompt="Extract prices") + ... job = await client.crawl.start("https://example.com", depth=3) """ + @classmethod def from_env( cls, @@ -113,67 +169,42 @@ def from_env( timeout: Optional[float] = None, max_retries: int = 3, retry_delay: float = 1.0, - mock: Optional[bool] = None, - mock_handler: Optional[Callable[[str, str, Dict[str, Any]], Any]] = None, - mock_responses: Optional[Dict[str, Any]] = None, - ): - """Initialize AsyncClient using API key from environment variable. 
- - Args: - verify_ssl: Whether to verify SSL certificates - timeout: Request timeout in seconds. None means no timeout (infinite) - max_retries: Maximum number of retry attempts - retry_delay: Delay between retries in seconds - """ + ) -> "AsyncClient": + """Initialize AsyncClient using SGAI_API_KEY environment variable.""" from os import getenv - # Allow enabling mock mode from environment if not explicitly provided - if mock is None: - mock_env = getenv("SGAI_MOCK", "0").strip().lower() - mock = mock_env in {"1", "true", "yes", "on"} - api_key = getenv("SGAI_API_KEY") - # In mock mode, we don't need a real API key if not api_key: - if mock: - api_key = "sgai-00000000-0000-0000-0000-000000000000" - else: - raise ValueError("SGAI_API_KEY environment variable not set") + raise ValueError("SGAI_API_KEY environment variable not set") return cls( api_key=api_key, verify_ssl=verify_ssl, timeout=timeout, max_retries=max_retries, retry_delay=retry_delay, - mock=bool(mock), - mock_handler=mock_handler, - mock_responses=mock_responses, ) def __init__( self, api_key: str = None, + base_url: Optional[str] = None, verify_ssl: bool = True, timeout: Optional[float] = None, max_retries: int = 3, retry_delay: float = 1.0, - mock: bool = False, - mock_handler: Optional[Callable[[str, str, Dict[str, Any]], Any]] = None, - mock_responses: Optional[Dict[str, Any]] = None, ): - """Initialize AsyncClient with configurable parameters. + """Initialize AsyncClient. Args: - api_key: API key for authentication. If None, will try to - load from environment + api_key: API key for authentication. If None, reads from SGAI_API_KEY env var + base_url: Override the default API base URL verify_ssl: Whether to verify SSL certificates - timeout: Request timeout in seconds. 
None means no timeout (infinite) - max_retries: Maximum number of retry attempts - retry_delay: Delay between retries in seconds + timeout: Request timeout in seconds (None = no timeout) + max_retries: Maximum retry attempts on server errors + retry_delay: Base delay between retries in seconds """ - logger.info("🔑 Initializing AsyncClient") + logger.info("Initializing AsyncClient") - # Try to get API key from environment if not provided if api_key is None: from os import getenv @@ -184,1141 +215,225 @@ def __init__( ) validate_api_key(api_key) - logger.debug( - f"🛠️ Configuration: verify_ssl={verify_ssl}, " - f"timeout={timeout}, max_retries={max_retries}" - ) + self.api_key = api_key - self.headers = {**DEFAULT_HEADERS, "SGAI-APIKEY": api_key} + self.base_url = (base_url or API_BASE_URL).rstrip("/") + self.headers = { + **DEFAULT_HEADERS, + "Authorization": f"Bearer {api_key}", + "SGAI-APIKEY": api_key, + } self.max_retries = max_retries self.retry_delay = retry_delay - self.mock = bool(mock) - self.mock_handler = mock_handler - self.mock_responses = mock_responses or {} ssl = None if verify_ssl else False self.timeout = ClientTimeout(total=timeout) if timeout is not None else None self.session = ClientSession( - headers=self.headers, connector=TCPConnector(ssl=ssl), timeout=self.timeout + headers=self.headers, + connector=TCPConnector(ssl=ssl), + timeout=self.timeout, ) - logger.info("✅ AsyncClient initialized successfully") - - async def _make_request(self, method: str, url: str, **kwargs) -> Any: - """ - Make asynchronous HTTP request with retry logic and error handling. - - Args: - method: HTTP method (GET, POST, etc.) 
- url: Full URL for the request - **kwargs: Additional arguments to pass to aiohttp - - Returns: - Parsed JSON response data + # Namespaced sub-clients + self.crawl = _AsyncCrawlNamespace(self) + self.monitor = _AsyncMonitorNamespace(self) - Raises: - APIError: If the API returns an error response - ConnectionError: If unable to connect after all retries + logger.info("AsyncClient initialized successfully") - Note: - In mock mode, this method returns deterministic responses without - making actual HTTP requests. - """ - # Short-circuit when mock mode is enabled - if getattr(self, "mock", False): - return self._mock_response(method, url, **kwargs) + async def _make_request(self, method: str, url: str, **kwargs: Any) -> Any: + """Make async HTTP request with retry logic.""" for attempt in range(self.max_retries): try: - logger.info( - f"🚀 Making {method} request to {url} " - f"(Attempt {attempt + 1}/{self.max_retries})" + logger.debug( + f"Making {method} request to {url} " + f"(attempt {attempt + 1}/{self.max_retries})" ) - logger.debug(f"🔍 Request parameters: {kwargs}") - async with self.session.request(method, url, **kwargs) as response: - logger.debug(f"📥 Response status: {response.status}") - result = await handle_async_response(response) - logger.info(f"✅ Request completed successfully: {method} {url}") - return result + return await handle_async_response(response) except ClientError as e: - logger.warning(f"⚠️ Request attempt {attempt + 1} failed: {str(e)}") + logger.warning(f"Request attempt {attempt + 1} failed: {e}") if hasattr(e, "status") and e.status is not None: try: error_data = await e.response.json() error_msg = error_data.get("error", str(e)) - logger.error(f"🔴 API Error: {error_msg}") raise APIError(error_msg, status_code=e.status) - except ValueError: - logger.error("🔴 Could not parse error response") + except (ValueError, AttributeError): raise APIError( str(e), - status_code=e.status if hasattr(e, "status") else None, + status_code=getattr(e, 
"status", None), ) if attempt == self.max_retries - 1: - logger.error(f"❌ All retry attempts failed for {method} {url}") - raise ConnectionError(f"Failed to connect to API: {str(e)}") + raise ConnectionError(f"Failed to connect to API: {e}") retry_delay = self.retry_delay * (attempt + 1) - logger.info(f"⏳ Waiting {retry_delay}s before retry {attempt + 2}") + logger.info(f"Waiting {retry_delay}s before retry {attempt + 2}") await asyncio.sleep(retry_delay) - def _mock_response(self, method: str, url: str, **kwargs) -> Any: - """Return a deterministic mock response without performing network I/O. - - Resolution order: - 1) If a custom mock_handler is provided, delegate to it - 2) If mock_responses contains a key for the request path, use it - 3) Fallback to built-in defaults per endpoint family - """ - logger.info(f"🧪 Mock mode active. Returning stub for {method} {url}") - - # 1) Custom handler - if self.mock_handler is not None: - try: - return self.mock_handler(method, url, kwargs) - except Exception as handler_error: - logger.warning(f"Custom mock_handler raised: {handler_error}. 
Falling back to defaults.") - - # 2) Path-based override - try: - parsed = urlparse(url) - path = parsed.path.rstrip("/") - except Exception: - path = url - - override = self.mock_responses.get(path) - if override is not None: - return override() if callable(override) else override - - # 3) Built-in defaults - def new_id(prefix: str) -> str: - return f"{prefix}-{_uuid.uuid4()}" - - upper_method = method.upper() - - # Credits endpoint - if path.endswith("/credits") and upper_method == "GET": - return {"remaining_credits": 1000, "total_credits_used": 0} - - # Health check endpoint - if path.endswith("/healthz") and upper_method == "GET": - return {"status": "healthy", "message": "Service is operational"} - - # Feedback acknowledge - if path.endswith("/feedback") and upper_method == "POST": - return {"status": "success"} - - # Create-like endpoints (POST) - if upper_method == "POST": - if path.endswith("/crawl"): - return {"crawl_id": new_id("mock-crawl")} - elif path.endswith("/scheduled-jobs"): - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" - } - elif "/pause" in path: - return { - "message": "Job paused successfully", - "job_id": new_id("mock-job"), - "is_active": False - } - elif "/resume" in path: - return { - "message": "Job resumed successfully", - "job_id": new_id("mock-job"), - "is_active": True, - "next_run_at": "2024-01-08T09:00:00Z" - } - elif "/trigger" in path: - task_id = new_id("mock-task") - return { - "execution_id": task_id, - "scheduled_job_id": new_id("mock-job"), - "triggered_at": "2024-01-01T00:00:00Z", - "message": f"Job triggered successfully. 
Task ID: {task_id}" - } - # All other POST endpoints return a request id - return {"request_id": new_id("mock-req")} - - # Status-like endpoints (GET) - if upper_method == "GET": - if "markdownify" in path: - return {"status": "completed", "content": "# Mock markdown\n\n..."} - if "smartscraper" in path: - return {"status": "completed", "result": [{"field": "value"}]} - if "searchscraper" in path: - return { - "status": "completed", - "results": [{"url": "https://example.com"}], - "markdown_content": "# Mock Markdown Content\n\nThis is mock markdown content for testing purposes.\n\n## Section 1\n\nSome content here.\n\n## Section 2\n\nMore content here.", - "reference_urls": ["https://example.com", "https://example2.com"] - } - if "crawl" in path: - return {"status": "completed", "pages": []} - if "agentic-scrapper" in path: - return {"status": "completed", "actions": []} - if "scheduled-jobs" in path: - if "/executions" in path: - return { - "executions": [ - { - "id": new_id("mock-exec"), - "scheduled_job_id": new_id("mock-job"), - "execution_id": new_id("mock-task"), - "status": "completed", - "started_at": "2024-01-01T00:00:00Z", - "completed_at": "2024-01-01T00:01:00Z", - "result": {"mock": "result"}, - "credits_used": 10 - } - ], - "total": 1, - "page": 1, - "page_size": 20 - } - elif path.endswith("/scheduled-jobs"): # List jobs endpoint - return { - "jobs": [ - { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" - } - ], - "total": 1, - "page": 1, - "page_size": 20 - } - else: # Single job endpoint - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 
1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" - } - - # Update operations (PATCH/PUT) - if upper_method in ["PATCH", "PUT"] and "scheduled-jobs" in path: - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Updated Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 10 * * 1", - "job_config": {"mock": "updated_config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T01:00:00Z", - "next_run_at": "2024-01-08T10:00:00Z" - } - - # Delete operations - if upper_method == "DELETE" and "scheduled-jobs" in path: - return {"message": "Scheduled job deleted successfully"} - - # Generic fallback - return {"status": "mock", "url": url, "method": method, "kwargs": kwargs} - - async def markdownify( - self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, wait_ms: Optional[int] = None, return_toon: bool = False - ): - """Send a markdownify request - - Args: - website_url: The URL to convert to markdown - headers: Optional HTTP headers - mock: Enable mock mode for testing - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - wait_ms: Number of milliseconds to wait before scraping the website - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Starting markdownify request for {website_url}") - if headers: - logger.debug("🔧 Using custom headers") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if render_heavy_js: - logger.debug("⚡ Heavy JavaScript rendering enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, 
render_heavy_js=render_heavy_js, stealth=stealth, wait_ms=wait_ms) - logger.debug("✅ Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/markdownify", json=request.model_dump() - ) - logger.info("✨ Markdownify request completed successfully") - return process_response_with_toon(result, return_toon) - - async def get_markdownify(self, request_id: str, return_toon: bool = False): - """Get the result of a previous markdownify request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Fetching markdownify result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetMarkdownifyRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") - - result = await self._make_request( - "GET", f"{API_BASE_URL}/markdownify/{request_id}" - ) - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) + # ------------------------------------------------------------------ + # Scrape + # ------------------------------------------------------------------ async def scrape( self, - website_url: str, - render_heavy_js: bool = False, - branding: bool = False, - headers: Optional[dict[str, str]] = None, - stealth: bool = False, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - """Send a scrape request to get HTML content from a website + url: str, + format: str = "markdown", + fetch_config: Optional[FetchConfig] = None, + ) -> Dict[str, Any]: + """Scrape a page and return it in the specified format. 
Args: - website_url: The URL of the website to get HTML from - render_heavy_js: Whether to render heavy JavaScript (defaults to False) - branding: Whether to include branding in the response (defaults to False) - headers: Optional headers to send with the request - stealth: Enable stealth mode to avoid bot detection - wait_ms: Number of milliseconds to wait before scraping the website - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) + url: URL to scrape + format: Output format - 'markdown', 'html', 'screenshot', or 'branding' + fetch_config: Fetch configuration options """ - logger.info(f"🔍 Starting scrape request for {website_url}") - logger.debug(f"🔧 Render heavy JS: {render_heavy_js}") - logger.debug(f"🔧 Branding: {branding}") - if headers: - logger.debug("🔧 Using custom headers") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - + logger.info(f"Scraping {url} (format={format})") request = ScrapeRequest( - website_url=website_url, - render_heavy_js=render_heavy_js, - branding=branding, - headers=headers, - stealth=stealth, - wait_ms=wait_ms, - ) - logger.debug("✅ Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/scrape", json=request.model_dump() - ) - logger.info("✨ Scrape request completed successfully") - return process_response_with_toon(result, return_toon) - - async def get_scrape(self, request_id: str, return_toon: bool = False): - """Get the result of a previous scrape request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Fetching scrape result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetScrapeRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") - - result = await 
self._make_request( - "GET", f"{API_BASE_URL}/scrape/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) - - async def sitemap( - self, - website_url: str, - mock: bool = False, - ) -> SitemapResponse: - """Extract all URLs from a website's sitemap. - - Automatically discovers sitemap from robots.txt or common sitemap locations. - - Args: - website_url: The URL of the website to extract sitemap from - mock: Whether to use mock mode for this request - - Returns: - SitemapResponse: Object containing list of URLs extracted from sitemap - - Raises: - ValueError: If website_url is invalid - APIError: If the API request fails - - Examples: - >>> async with AsyncClient(api_key="your-api-key") as client: - ... response = await client.sitemap("https://example.com") - ... print(f"Found {len(response.urls)} URLs") - ... for url in response.urls[:5]: - ... print(url) - """ - logger.info(f"🗺️ Starting sitemap extraction for {website_url}") - - request = SitemapRequest( - website_url=website_url, - mock=mock + url=url, + format=ScrapeFormat(format), + fetch_config=fetch_config, ) - logger.debug("✅ Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/sitemap", json=request.model_dump() + return await self._make_request( + "POST", f"{self.base_url}/scrape", json=request.model_dump() ) - logger.info(f"✨ Sitemap extraction completed successfully - found {len(result.get('urls', []))} URLs") - # Parse response into SitemapResponse model - return SitemapResponse(**result) + # ------------------------------------------------------------------ + # Extract (replaces SmartScraper) + # ------------------------------------------------------------------ - async def smartscraper( + async def extract( self, - user_prompt: str, - website_url: Optional[str] = None, - website_html: Optional[str] = None, - website_markdown: Optional[str] = None, - headers: 
Optional[dict[str, str]] = None, - cookies: Optional[Dict[str, str]] = None, - output_schema: Optional[BaseModel] = None, - number_of_scrolls: Optional[int] = None, - total_pages: Optional[int] = None, - mock: bool = False, - plain_text: bool = False, - render_heavy_js: bool = False, - stealth: bool = False, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - """ - Send a smartscraper request with optional pagination support and cookies. - - Supports three types of input (must provide exactly one): - - website_url: Scrape from a URL - - website_html: Process local HTML content - - website_markdown: Process local Markdown content - - Args: - user_prompt: Natural language prompt describing what to extract - website_url: URL to scrape (optional) - website_html: Raw HTML content to process (optional, max 2MB) - website_markdown: Markdown content to process (optional, max 2MB) - headers: Optional HTTP headers - cookies: Optional cookies for authentication - output_schema: Optional Pydantic model for structured output - number_of_scrolls: Number of times to scroll (0-100) - total_pages: Number of pages to scrape (1-10) - mock: Enable mock mode for testing - plain_text: Return plain text instead of structured data - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - wait_ms: Number of milliseconds to wait before scraping the website - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - - Returns: - Dictionary containing the scraping results, or TOON formatted string if return_toon=True - - Raises: - ValueError: If validation fails or invalid parameters provided - APIError: If the API request fails - """ - logger.info("🔍 Starting smartscraper request") - if website_url: - logger.debug(f"🌐 URL: {website_url}") - if website_html: - logger.debug("📄 Using provided HTML content") - if website_markdown: - logger.debug("📝 Using provided Markdown content") - if headers: - 
logger.debug("🔧 Using custom headers") - if cookies: - logger.debug("🍪 Using cookies for authentication/session management") - if number_of_scrolls is not None: - logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}") - if total_pages is not None: - logger.debug(f"📄 Total pages to scrape: {total_pages}") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if render_heavy_js: - logger.debug("⚡ Heavy JavaScript rendering enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - logger.debug(f"📝 Prompt: {user_prompt}") - - request = SmartScraperRequest( - website_url=website_url, - website_html=website_html, - website_markdown=website_markdown, - headers=headers, - cookies=cookies, - user_prompt=user_prompt, - output_schema=output_schema, - number_of_scrolls=number_of_scrolls, - total_pages=total_pages, - mock=mock, - plain_text=plain_text, - render_heavy_js=render_heavy_js, - stealth=stealth, - wait_ms=wait_ms, - ) - - logger.debug("✅ Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/smartscraper", json=request.model_dump() - ) - logger.info("✨ Smartscraper request completed successfully") - return process_response_with_toon(result, return_toon) + url: str, + prompt: str, + output_schema: Optional[Any] = None, + fetch_config: Optional[FetchConfig] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Extract structured data from a page using AI. 
- async def get_smartscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous smartscraper request - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) + url: URL to extract data from + prompt: Natural language prompt describing what to extract + output_schema: JSON Schema dict or Pydantic BaseModel class for output structure + fetch_config: Fetch configuration options + llm_config: LLM configuration options """ - logger.info(f"🔍 Fetching smartscraper result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetSmartScraperRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") - - result = await self._make_request( - "GET", f"{API_BASE_URL}/smartscraper/{request_id}" - ) - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) - - async def submit_feedback( - self, request_id: str, rating: int, feedback_text: Optional[str] = None - ): - """Submit feedback for a request""" - logger.info(f"📝 Submitting feedback for request {request_id}") - logger.debug(f"⭐ Rating: {rating}, Feedback: {feedback_text}") - - feedback = FeedbackRequest( - request_id=request_id, rating=rating, feedback_text=feedback_text - ) - logger.debug("✅ Feedback validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/feedback", json=feedback.model_dump() - ) - logger.info("✨ Feedback submitted successfully") - return result - - async def get_credits(self): - """Get credits information""" - logger.info("💳 Fetching credits information") + logger.info(f"Extracting from {url}") + + schema_dict = None + if output_schema is not None: + if isinstance(output_schema, type) and issubclass(output_schema, BaseModel): + schema_dict = output_schema.model_json_schema() + elif 
isinstance(output_schema, dict): + schema_dict = output_schema + else: + raise ValueError( + "output_schema must be a dict (JSON Schema) or a Pydantic BaseModel class" + ) - result = await self._make_request( - "GET", - f"{API_BASE_URL}/credits", + request = ExtractRequest( + url=url, + prompt=prompt, + output_schema=schema_dict, + fetch_config=fetch_config, + llm_config=llm_config, ) - logger.info( - f"✨ Credits info retrieved: " - f"{result.get('remaining_credits')} credits remaining" + return await self._make_request( + "POST", f"{self.base_url}/extract", json=request.model_dump() ) - return result - - async def healthz(self): - """Check the health status of the service - - This endpoint is useful for monitoring and ensuring the service is operational. - It returns a JSON response indicating the service's health status. - - Returns: - dict: Health status information - - Example: - >>> async with AsyncClient.from_env() as client: - ... health = await client.healthz() - ... print(health) - """ - logger.info("🏥 Checking service health") - result = await self._make_request( - "GET", - f"{API_BASE_URL}/healthz", - ) - logger.info("✨ Health check completed successfully") - return result + # ------------------------------------------------------------------ + # Search (replaces SearchScraper) + # ------------------------------------------------------------------ - async def searchscraper( + async def search( self, - user_prompt: str, - num_results: Optional[int] = 3, - headers: Optional[dict[str, str]] = None, - output_schema: Optional[BaseModel] = None, - extraction_mode: bool = True, - stealth: bool = False, - location_geo_code: Optional[str] = None, - time_range: Optional[TimeRange] = None, - return_toon: bool = False, - ): - """Send a searchscraper request + query: str, + num_results: int = 5, + output_schema: Optional[Any] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Search the web and extract structured results. 
Args: - user_prompt: The search prompt string - num_results: Number of websites to scrape (3-20). Default is 3. - More websites provide better research depth but cost more - credits. Credit calculation: 30 base + 10 per additional - website beyond 3. - headers: Optional headers to send with the request - output_schema: Optional schema to structure the output - extraction_mode: Whether to use AI extraction (True) or markdown conversion (False). - AI extraction costs 10 credits per page, markdown conversion costs 2 credits per page. - stealth: Enable stealth mode to avoid bot detection - location_geo_code: Optional geo code of the location to search in (e.g., "us") - time_range: Optional time range filter for search results (e.g., TimeRange.PAST_WEEK) - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) + query: The search query + num_results: Number of results (3-20, default 5) + output_schema: JSON Schema dict or Pydantic BaseModel class for output structure + llm_config: LLM configuration options """ - logger.info("🔍 Starting searchscraper request") - logger.debug(f"📝 Prompt: {user_prompt}") - logger.debug(f"🌐 Number of results: {num_results}") - logger.debug(f"🤖 Extraction mode: {'AI extraction' if extraction_mode else 'Markdown conversion'}") - if headers: - logger.debug("🔧 Using custom headers") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if location_geo_code: - logger.debug(f"🌍 Location geo code: {location_geo_code}") - if time_range: - logger.debug(f"📅 Time range: {time_range.value}") - if return_toon: - logger.debug("🎨 TOON format output enabled") + logger.info(f"Searching: {query}") + + schema_dict = None + if output_schema is not None: + if isinstance(output_schema, type) and issubclass(output_schema, BaseModel): + schema_dict = output_schema.model_json_schema() + elif isinstance(output_schema, dict): + schema_dict = output_schema + else: + raise ValueError( + "output_schema must be a dict (JSON Schema) or a 
Pydantic BaseModel class" + ) - request = SearchScraperRequest( - user_prompt=user_prompt, + request = SearchRequest( + query=query, num_results=num_results, - headers=headers, - output_schema=output_schema, - extraction_mode=extraction_mode, - stealth=stealth, - location_geo_code=location_geo_code, - time_range=time_range, + output_schema=schema_dict, + llm_config=llm_config, ) - logger.debug("✅ Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/searchscraper", json=request.model_dump() + return await self._make_request( + "POST", f"{self.base_url}/search", json=request.model_dump() ) - logger.info("✨ Searchscraper request completed successfully") - return process_response_with_toon(result, return_toon) - async def get_searchscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous searchscraper request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Fetching searchscraper result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") + # ------------------------------------------------------------------ + # Credits + # ------------------------------------------------------------------ - # Validate input using Pydantic model - GetSearchScraperRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") + async def credits(self) -> Dict[str, Any]: + """Get remaining API credits.""" + logger.info("Fetching credits") + return await self._make_request("GET", f"{self.base_url}/credits") - result = await self._make_request( - "GET", f"{API_BASE_URL}/searchscraper/{request_id}" - ) - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) + # ------------------------------------------------------------------ + # History + # 
------------------------------------------------------------------ - async def crawl( + async def history( self, - url: str, - prompt: Optional[str] = None, - data_schema: Optional[Dict[str, Any]] = None, - extraction_mode: bool = True, - cache_website: bool = True, - depth: int = 2, - breadth: Optional[int] = None, - max_pages: int = 2, - same_domain_only: bool = True, - batch_size: Optional[int] = None, - sitemap: bool = False, - headers: Optional[dict[str, str]] = None, - render_heavy_js: bool = False, - stealth: bool = False, - include_paths: Optional[list[str]] = None, - exclude_paths: Optional[list[str]] = None, - webhook_url: Optional[str] = None, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - """Send a crawl request with support for both AI extraction and - markdown conversion modes - - Args: - url: The starting URL to crawl - prompt: AI prompt for data extraction (required for AI extraction mode) - data_schema: Schema for structured output - extraction_mode: Whether to use AI extraction (True) or markdown (False) - cache_website: Whether to cache the website - depth: Maximum depth of link traversal - breadth: Maximum number of links to crawl per depth level. If None, unlimited (default). - Controls the 'width' of exploration at each depth. Useful for limiting crawl scope - on large sites. Note: max_pages always takes priority. Ignored when sitemap=True. 
- max_pages: Maximum number of pages to crawl - same_domain_only: Only crawl pages within the same domain - batch_size: Number of pages to process in batch - sitemap: Use sitemap for crawling - headers: Optional HTTP headers - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - include_paths: List of path patterns to include (e.g., ['/products/*', '/blog/**']) - Supports wildcards: * matches any characters, ** matches any path segments - exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*']) - Supports wildcards and takes precedence over include_paths - webhook_url: URL to receive webhook notifications when the crawl completes - wait_ms: Number of milliseconds to wait before scraping each page - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info("🔍 Starting crawl request") - logger.debug(f"🌐 URL: {url}") - logger.debug( - f"🤖 Extraction mode: {'AI' if extraction_mode else 'Markdown conversion'}" - ) - if extraction_mode: - logger.debug(f"📝 Prompt: {prompt}") - logger.debug(f"📊 Schema provided: {bool(data_schema)}") - else: - logger.debug( - "📄 Markdown conversion mode - no AI processing, 2 credits per page" - ) - logger.debug(f"💾 Cache website: {cache_website}") - logger.debug(f"🔍 Depth: {depth}") - if breadth is not None: - logger.debug(f"📏 Breadth: {breadth}") - logger.debug(f"📄 Max pages: {max_pages}") - logger.debug(f"🏠 Same domain only: {same_domain_only}") - logger.debug(f"🗺️ Use sitemap: {sitemap}") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if render_heavy_js: - logger.debug("⚡ Heavy JavaScript rendering enabled") - if batch_size is not None: - logger.debug(f"📦 Batch size: {batch_size}") - if include_paths: - logger.debug(f"✅ Include paths: {include_paths}") - if exclude_paths: - logger.debug(f"❌ Exclude paths: {exclude_paths}") - if webhook_url: - logger.debug(f"🔔 Webhook URL: {webhook_url}") - if wait_ms is not 
None: - logger.debug(f"⏱️ Wait ms: {wait_ms}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Build request data, excluding None values - request_data = { - "url": url, - "extraction_mode": extraction_mode, - "cache_website": cache_website, - "depth": depth, - "max_pages": max_pages, - "same_domain_only": same_domain_only, - "sitemap": sitemap, - "render_heavy_js": render_heavy_js, - "stealth": stealth, - } - - # Add optional parameters only if provided - if prompt is not None: - request_data["prompt"] = prompt - if data_schema is not None: - request_data["data_schema"] = data_schema - if breadth is not None: - request_data["breadth"] = breadth - if batch_size is not None: - request_data["batch_size"] = batch_size - if headers is not None: - request_data["headers"] = headers - if include_paths is not None: - request_data["include_paths"] = include_paths - if exclude_paths is not None: - request_data["exclude_paths"] = exclude_paths - if webhook_url is not None: - request_data["webhook_url"] = webhook_url - if wait_ms is not None: - request_data["wait_ms"] = wait_ms - - request = CrawlRequest(**request_data) - logger.debug("✅ Request validation passed") - - # Serialize the request, excluding None values - request_json = request.model_dump(exclude_none=True) - result = await self._make_request( - "POST", f"{API_BASE_URL}/crawl", json=request_json - ) - logger.info("✨ Crawl request completed successfully") - return process_response_with_toon(result, return_toon) - - async def get_crawl(self, crawl_id: str, return_toon: bool = False): - """Get the result of a previous crawl request - - Args: - crawl_id: The crawl ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Fetching crawl result for request {crawl_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetCrawlRequest(crawl_id=crawl_id) - logger.debug("✅ 
Request ID validation passed") - - result = await self._make_request("GET", f"{API_BASE_URL}/crawl/{crawl_id}") - logger.info(f"✨ Successfully retrieved result for request {crawl_id}") - return process_response_with_toon(result, return_toon) - - async def agenticscraper( - self, - url: str, - steps: list[str], - use_session: bool = True, - user_prompt: Optional[str] = None, - output_schema: Optional[Dict[str, Any]] = None, - ai_extraction: bool = False, - stealth: bool = False, - return_toon: bool = False, - ): - """Send an agentic scraper request to perform automated actions on a webpage - - Args: - url: The URL to scrape - steps: List of steps to perform on the webpage - use_session: Whether to use session for the scraping (default: True) - user_prompt: Prompt for AI extraction (required when ai_extraction=True) - output_schema: Schema for structured data extraction (optional, used with ai_extraction=True) - ai_extraction: Whether to use AI for data extraction from the scraped content (default: False) - stealth: Enable stealth mode to avoid bot detection - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🤖 Starting agentic scraper request for {url}") - logger.debug(f"🔧 Use session: {use_session}") - logger.debug(f"📋 Steps: {steps}") - logger.debug(f"🧠 AI extraction: {ai_extraction}") - if ai_extraction: - logger.debug(f"💭 User prompt: {user_prompt}") - logger.debug(f"📋 Output schema provided: {output_schema is not None}") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - request = AgenticScraperRequest( - url=url, - steps=steps, - use_session=use_session, - user_prompt=user_prompt, - output_schema=output_schema, - ai_extraction=ai_extraction, - stealth=stealth, - ) - logger.debug("✅ Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/agentic-scrapper", json=request.model_dump() - ) - 
logger.info("✨ Agentic scraper request completed successfully") - return process_response_with_toon(result, return_toon) - - async def get_agenticscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous agentic scraper request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Fetching agentic scraper result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetAgenticScraperRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") - - result = await self._make_request("GET", f"{API_BASE_URL}/agentic-scrapper/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) - - async def generate_schema( - self, - user_prompt: str, - existing_schema: Optional[Dict[str, Any]] = None, - ): - """Generate a JSON schema from a user prompt - - Args: - user_prompt: The user's search query to be refined into a schema - existing_schema: Optional existing JSON schema to modify/extend - """ - logger.info("🔧 Starting schema generation request") - logger.debug(f"💭 User prompt: {user_prompt}") - if existing_schema: - logger.debug(f"📋 Existing schema provided: {existing_schema is not None}") - - request = GenerateSchemaRequest( - user_prompt=user_prompt, - existing_schema=existing_schema, - ) - logger.debug("✅ Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/generate_schema", json=request.model_dump() - ) - logger.info("✨ Schema generation request completed successfully") - return result + endpoint: Optional[str] = None, + status: Optional[str] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, + ) -> Dict[str, Any]: + """Retrieve request history. 
- async def get_schema_status(self, request_id: str): - """Get the result of a previous schema generation request - Args: - request_id: The request ID returned from generate_schema + endpoint: Filter by endpoint name (e.g. 'scrape', 'extract') + status: Filter by request status + limit: Maximum number of results (1-100) + offset: Number of results to skip """ - logger.info(f"🔍 Fetching schema generation status for request {request_id}") - - # Validate input using Pydantic model - GetSchemaStatusRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") - - result = await self._make_request("GET", f"{API_BASE_URL}/generate_schema/{request_id}") - logger.info(f"✨ Successfully retrieved schema status for request {request_id}") - return result - - async def create_scheduled_job( - self, - job_name: str, - service_type: str, - cron_expression: str, - job_config: dict, - is_active: bool = True, - ): - """Create a new scheduled job""" - logger.info(f"📅 Creating scheduled job: {job_name}") - - request = ScheduledJobCreate( - job_name=job_name, - service_type=service_type, - cron_expression=cron_expression, - job_config=job_config, - is_active=is_active, - ) - - result = await self._make_request( - "POST", f"{API_BASE_URL}/scheduled-jobs", json=request.model_dump() - ) - logger.info("✨ Scheduled job created successfully") - return result - - async def get_scheduled_jobs( - self, - page: int = 1, - page_size: int = 20, - service_type: Optional[str] = None, - is_active: Optional[bool] = None, - ): - """Get list of scheduled jobs with pagination""" - logger.info("📋 Fetching scheduled jobs") - - GetScheduledJobsRequest( - page=page, - page_size=page_size, - service_type=service_type, - is_active=is_active, - ) - - params = {"page": page, "page_size": page_size} - if service_type: - params["service_type"] = service_type - if is_active is not None: - params["is_active"] = is_active - - result = await self._make_request("GET", f"{API_BASE_URL}/scheduled-jobs", 
params=params) - logger.info(f"✨ Successfully retrieved {len(result.get('jobs', []))} scheduled jobs") - return result - - async def get_scheduled_job(self, job_id: str): - """Get details of a specific scheduled job""" - logger.info(f"🔍 Fetching scheduled job {job_id}") - - GetScheduledJobRequest(job_id=job_id) - - result = await self._make_request("GET", f"{API_BASE_URL}/scheduled-jobs/{job_id}") - logger.info(f"✨ Successfully retrieved scheduled job {job_id}") - return result - - async def update_scheduled_job( - self, - job_id: str, - job_name: Optional[str] = None, - cron_expression: Optional[str] = None, - job_config: Optional[dict] = None, - is_active: Optional[bool] = None, - ): - """Update an existing scheduled job (partial update)""" - logger.info(f"📝 Updating scheduled job {job_id}") - - update_data = {} - if job_name is not None: - update_data["job_name"] = job_name - if cron_expression is not None: - update_data["cron_expression"] = cron_expression - if job_config is not None: - update_data["job_config"] = job_config - if is_active is not None: - update_data["is_active"] = is_active - - ScheduledJobUpdate(**update_data) - - result = await self._make_request( - "PATCH", f"{API_BASE_URL}/scheduled-jobs/{job_id}", json=update_data + logger.info("Fetching history") + filter_obj = HistoryFilter( + endpoint=endpoint, status=status, limit=limit, offset=offset ) - logger.info(f"✨ Successfully updated scheduled job {job_id}") - return result - - async def replace_scheduled_job( - self, - job_id: str, - job_name: str, - cron_expression: str, - job_config: dict, - is_active: bool = True, - ): - """Replace an existing scheduled job (full update)""" - logger.info(f"🔄 Replacing scheduled job {job_id}") - - request_data = { - "job_name": job_name, - "cron_expression": cron_expression, - "job_config": job_config, - "is_active": is_active, - } - - result = await self._make_request( - "PUT", f"{API_BASE_URL}/scheduled-jobs/{job_id}", json=request_data + params = 
filter_obj.to_params() + return await self._make_request( + "GET", f"{self.base_url}/history", params=params or None ) - logger.info(f"✨ Successfully replaced scheduled job {job_id}") - return result - - async def delete_scheduled_job(self, job_id: str): - """Delete a scheduled job""" - logger.info(f"🗑️ Deleting scheduled job {job_id}") - - JobActionRequest(job_id=job_id) - - result = await self._make_request("DELETE", f"{API_BASE_URL}/scheduled-jobs/{job_id}") - logger.info(f"✨ Successfully deleted scheduled job {job_id}") - return result - - async def pause_scheduled_job(self, job_id: str): - """Pause a scheduled job""" - logger.info(f"⏸️ Pausing scheduled job {job_id}") - - JobActionRequest(job_id=job_id) - result = await self._make_request("POST", f"{API_BASE_URL}/scheduled-jobs/{job_id}/pause") - logger.info(f"✨ Successfully paused scheduled job {job_id}") - return result - - async def resume_scheduled_job(self, job_id: str): - """Resume a paused scheduled job""" - logger.info(f"▶️ Resuming scheduled job {job_id}") - - JobActionRequest(job_id=job_id) - - result = await self._make_request("POST", f"{API_BASE_URL}/scheduled-jobs/{job_id}/resume") - logger.info(f"✨ Successfully resumed scheduled job {job_id}") - return result - - async def trigger_scheduled_job(self, job_id: str): - """Manually trigger a scheduled job""" - logger.info(f"🚀 Manually triggering scheduled job {job_id}") - - TriggerJobRequest(job_id=job_id) - - result = await self._make_request("POST", f"{API_BASE_URL}/scheduled-jobs/{job_id}/trigger") - logger.info(f"✨ Successfully triggered scheduled job {job_id}") - return result - - async def get_job_executions( - self, - job_id: str, - page: int = 1, - page_size: int = 20, - status: Optional[str] = None, - ): - """Get execution history for a scheduled job""" - logger.info(f"📊 Fetching execution history for job {job_id}") - - GetJobExecutionsRequest( - job_id=job_id, - page=page, - page_size=page_size, - status=status, - ) - - params = {"page": 
page, "page_size": page_size} - if status: - params["status"] = status - - result = await self._make_request( - "GET", f"{API_BASE_URL}/scheduled-jobs/{job_id}/executions", params=params - ) - logger.info(f"✨ Successfully retrieved execution history for job {job_id}") - return result + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ - async def close(self): - """Close the session to free up resources""" - logger.info("🔒 Closing AsyncClient session") + async def close(self) -> None: + """Close the HTTP session.""" + logger.info("Closing AsyncClient session") await self.session.close() - logger.debug("✅ Session closed successfully") - async def __aenter__(self): + async def __aenter__(self) -> "AsyncClient": return self - async def __aexit__(self, exc_type, exc_val, exc_tb): + async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: await self.close() diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index 28fc1bf..48c688a 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -1,34 +1,21 @@ """ -Synchronous HTTP client for the ScrapeGraphAI API. - -This module provides a synchronous client for interacting with all ScrapeGraphAI -API endpoints including smartscraper, searchscraper, crawl, agentic scraper, -markdownify, schema generation, scheduled jobs, and utility functions. - -The Client class supports: -- API key authentication -- SSL verification configuration -- Request timeout configuration -- Automatic retry logic with exponential backoff -- Mock mode for testing -- Context manager support for proper resource cleanup +Synchronous HTTP client for the ScrapeGraphAI v2 API. Example: - Basic usage with environment variables: - >>> from scrapegraph_py import Client - >>> client = Client.from_env() - >>> result = client.smartscraper( - ... 
website_url="https://example.com", - ... user_prompt="Extract product information" - ... ) - - Using context manager: - >>> with Client(api_key="sgai-...") as client: - ... result = client.scrape(website_url="https://example.com") + >>> from scrapegraph_py import Client + >>> client = Client(api_key="sgai-...") + >>> result = client.extract( + ... url="https://example.com", + ... prompt="Extract product information" + ... ) + >>> print(result) + + >>> # Namespaced crawl/monitor + >>> job = client.crawl.start("https://example.com", depth=3) + >>> status = client.crawl.status(job["id"]) """ -import uuid as _uuid -from typing import Any, Callable, Dict, Optional -from urllib.parse import urlparse + +from typing import Any, Callable, Dict, List, Optional import requests import urllib3 @@ -38,71 +25,190 @@ from scrapegraph_py.config import API_BASE_URL, DEFAULT_HEADERS from scrapegraph_py.exceptions import APIError from scrapegraph_py.logger import sgai_logger as logger -from scrapegraph_py.models.agenticscraper import ( - AgenticScraperRequest, - GetAgenticScraperRequest, -) -from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest -from scrapegraph_py.models.feedback import FeedbackRequest -from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest -from scrapegraph_py.models.schema import ( - GenerateSchemaRequest, - GetSchemaStatusRequest, - SchemaGenerationResponse, -) -from scrapegraph_py.models.scrape import GetScrapeRequest, ScrapeRequest -from scrapegraph_py.models.searchscraper import ( - GetSearchScraperRequest, - SearchScraperRequest, - TimeRange, -) -from scrapegraph_py.models.sitemap import SitemapRequest, SitemapResponse -from scrapegraph_py.models.smartscraper import ( - GetSmartScraperRequest, - SmartScraperRequest, -) -from scrapegraph_py.models.scheduled_jobs import ( - GetJobExecutionsRequest, - GetScheduledJobRequest, - GetScheduledJobsRequest, - JobActionRequest, - JobActionResponse, - 
JobExecutionListResponse, - JobTriggerResponse, - ScheduledJobCreate, - ScheduledJobListResponse, - ScheduledJobResponse, - ScheduledJobUpdate, - TriggerJobRequest, -) +from scrapegraph_py.models.crawl import CrawlFormat, CrawlRequest +from scrapegraph_py.models.extract import ExtractRequest +from scrapegraph_py.models.history import HistoryFilter +from scrapegraph_py.models.monitor import MonitorCreateRequest +from scrapegraph_py.models.scrape import ScrapeFormat, ScrapeRequest +from scrapegraph_py.models.search import SearchRequest +from scrapegraph_py.models.shared import FetchConfig, LlmConfig from scrapegraph_py.utils.helpers import handle_sync_response, validate_api_key -from scrapegraph_py.utils.toon_converter import process_response_with_toon -class Client: - """ - Synchronous client for the ScrapeGraphAI API. +class _CrawlNamespace: + """Namespaced crawl operations: client.crawl.start(), .status(), .stop(), .resume().""" + + def __init__(self, client: "Client"): + self._client = client + + def start( + self, + url: str, + depth: int = 2, + max_pages: int = 10, + format: str = "markdown", + include_patterns: Optional[List[str]] = None, + exclude_patterns: Optional[List[str]] = None, + fetch_config: Optional[FetchConfig] = None, + ) -> Dict[str, Any]: + """Start a crawl job. 
+ + Args: + url: The starting URL for the crawl + depth: Maximum crawl depth (1-10) + max_pages: Maximum pages to crawl (1-100) + format: Output format - 'markdown' or 'html' + include_patterns: URL patterns to include + exclude_patterns: URL patterns to exclude + fetch_config: Fetch configuration options + """ + logger.info(f"Starting crawl for {url}") + request = CrawlRequest( + url=url, + depth=depth, + max_pages=max_pages, + format=CrawlFormat(format), + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + fetch_config=fetch_config, + ) + return self._client._make_request( + "POST", f"{self._client.base_url}/crawl", json=request.model_dump() + ) + + def status(self, crawl_id: str) -> Dict[str, Any]: + """Get crawl job status and results. + + Args: + crawl_id: The crawl job ID + """ + logger.info(f"Fetching crawl status for {crawl_id}") + return self._client._make_request("GET", f"{self._client.base_url}/crawl/{crawl_id}") + + def stop(self, crawl_id: str) -> Dict[str, Any]: + """Stop a running crawl job. + + Args: + crawl_id: The crawl job ID to stop + """ + logger.info(f"Stopping crawl {crawl_id}") + return self._client._make_request( + "POST", f"{self._client.base_url}/crawl/{crawl_id}/stop" + ) + + def resume(self, crawl_id: str) -> Dict[str, Any]: + """Resume a stopped crawl job. + + Args: + crawl_id: The crawl job ID to resume + """ + logger.info(f"Resuming crawl {crawl_id}") + return self._client._make_request( + "POST", f"{self._client.base_url}/crawl/{crawl_id}/resume" + ) + + +class _MonitorNamespace: + """Namespaced monitor operations: client.monitor.create(), .list(), .get(), etc.""" + + def __init__(self, client: "Client"): + self._client = client + + def create( + self, + name: str, + url: str, + prompt: str, + cron: str, + output_schema: Optional[Dict[str, Any]] = None, + fetch_config: Optional[FetchConfig] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Create a new monitor. 
+ + Args: + name: Name of the monitor + url: URL to monitor + prompt: Prompt for AI extraction + cron: Cron expression (5 fields) + output_schema: Optional JSON Schema for structured output + fetch_config: Fetch configuration options + llm_config: LLM configuration options + """ + logger.info(f"Creating monitor '{name}' for {url}") + request = MonitorCreateRequest( + name=name, + url=url, + prompt=prompt, + cron=cron, + output_schema=output_schema, + fetch_config=fetch_config, + llm_config=llm_config, + ) + return self._client._make_request( + "POST", f"{self._client.base_url}/monitor", json=request.model_dump() + ) + + def list(self) -> Dict[str, Any]: + """List all monitors.""" + logger.info("Listing monitors") + return self._client._make_request("GET", f"{self._client.base_url}/monitor") + + def get(self, monitor_id: str) -> Dict[str, Any]: + """Get a specific monitor. + + Args: + monitor_id: The monitor ID + """ + logger.info(f"Fetching monitor {monitor_id}") + return self._client._make_request( + "GET", f"{self._client.base_url}/monitor/{monitor_id}" + ) + + def pause(self, monitor_id: str) -> Dict[str, Any]: + """Pause a monitor. + + Args: + monitor_id: The monitor ID to pause + """ + logger.info(f"Pausing monitor {monitor_id}") + return self._client._make_request( + "POST", f"{self._client.base_url}/monitor/{monitor_id}/pause" + ) + + def resume(self, monitor_id: str) -> Dict[str, Any]: + """Resume a paused monitor. + + Args: + monitor_id: The monitor ID to resume + """ + logger.info(f"Resuming monitor {monitor_id}") + return self._client._make_request( + "POST", f"{self._client.base_url}/monitor/{monitor_id}/resume" + ) + + def delete(self, monitor_id: str) -> Dict[str, Any]: + """Delete a monitor. 
+ + Args: + monitor_id: The monitor ID to delete + """ + logger.info(f"Deleting monitor {monitor_id}") + return self._client._make_request( + "DELETE", f"{self._client.base_url}/monitor/{monitor_id}" + ) - This class provides synchronous methods for all ScrapeGraphAI API endpoints. - It handles authentication, request management, error handling, and supports - mock mode for testing. - Attributes: - api_key (str): The API key for authentication - headers (dict): Default headers including API key - timeout (Optional[float]): Request timeout in seconds - max_retries (int): Maximum number of retry attempts - retry_delay (float): Delay between retries in seconds - mock (bool): Whether mock mode is enabled - session (requests.Session): HTTP session for connection pooling +class Client: + """Synchronous client for the ScrapeGraphAI v2 API. Example: - >>> client = Client.from_env() - >>> result = client.smartscraper( - ... website_url="https://example.com", - ... user_prompt="Extract all products" - ... ) + >>> client = Client(api_key="sgai-...") + >>> result = client.scrape("https://example.com") + >>> result = client.extract("https://example.com", prompt="Extract prices") + >>> job = client.crawl.start("https://example.com", depth=3) """ + @classmethod def from_env( cls, @@ -110,75 +216,42 @@ def from_env( timeout: Optional[float] = None, max_retries: int = 3, retry_delay: float = 1.0, - mock: Optional[bool] = None, - mock_handler: Optional[Callable[[str, str, Dict[str, Any]], Any]] = None, - mock_responses: Optional[Dict[str, Any]] = None, - ): - """Initialize Client using API key from environment variable. - - Args: - verify_ssl: Whether to verify SSL certificates - timeout: Request timeout in seconds. None means no timeout (infinite) - max_retries: Maximum number of retry attempts - retry_delay: Delay between retries in seconds - mock: If True, the client will not perform real HTTP requests and - will return stubbed responses. If None, reads from SGAI_MOCK env. 
- """ + ) -> "Client": + """Initialize Client using SGAI_API_KEY environment variable.""" from os import getenv - # Allow enabling mock mode from environment if not explicitly provided - if mock is None: - mock_env = getenv("SGAI_MOCK", "0").strip().lower() - mock = mock_env in {"1", "true", "yes", "on"} - api_key = getenv("SGAI_API_KEY") - # In mock mode, we don't need a real API key if not api_key: - if mock: - api_key = "sgai-00000000-0000-0000-0000-000000000000" - else: - raise ValueError("SGAI_API_KEY environment variable not set") + raise ValueError("SGAI_API_KEY environment variable not set") return cls( api_key=api_key, verify_ssl=verify_ssl, timeout=timeout, max_retries=max_retries, retry_delay=retry_delay, - mock=bool(mock), - mock_handler=mock_handler, - mock_responses=mock_responses, ) def __init__( self, api_key: str = None, + base_url: Optional[str] = None, verify_ssl: bool = True, timeout: Optional[float] = None, max_retries: int = 3, retry_delay: float = 1.0, - mock: bool = False, - mock_handler: Optional[Callable[[str, str, Dict[str, Any]], Any]] = None, - mock_responses: Optional[Dict[str, Any]] = None, ): - """Initialize Client with configurable parameters. + """Initialize Client. Args: - api_key: API key for authentication. If None, will try to load - from environment + api_key: API key for authentication. If None, reads from SGAI_API_KEY env var + base_url: Override the default API base URL verify_ssl: Whether to verify SSL certificates - timeout: Request timeout in seconds. None means no timeout (infinite) - max_retries: Maximum number of retry attempts - retry_delay: Delay between retries in seconds - mock: If True, the client will bypass HTTP calls and return - deterministic mock responses - mock_handler: Optional callable to generate custom mock responses - given (method, url, request_kwargs) - mock_responses: Optional mapping of path (e.g. 
"/v1/credits") to - static response or callable returning a response + timeout: Request timeout in seconds (None = no timeout) + max_retries: Maximum retry attempts on server errors + retry_delay: Base delay between retries in seconds """ - logger.info("🔑 Initializing Client") + logger.info("Initializing Client") - # Try to get API key from environment if not provided if api_key is None: from os import getenv @@ -189,28 +262,25 @@ def __init__( ) validate_api_key(api_key) - logger.debug( - f"🛠️ Configuration: verify_ssl={verify_ssl}, timeout={timeout}, " - f"max_retries={max_retries}" - ) self.api_key = api_key - self.headers = {**DEFAULT_HEADERS, "SGAI-APIKEY": api_key} + self.base_url = (base_url or API_BASE_URL).rstrip("/") + self.headers = { + **DEFAULT_HEADERS, + "Authorization": f"Bearer {api_key}", + "SGAI-APIKEY": api_key, + } self.timeout = timeout self.max_retries = max_retries self.retry_delay = retry_delay - self.mock = bool(mock) - self.mock_handler = mock_handler - self.mock_responses = mock_responses or {} - # Create a session for connection pooling + # HTTP session with connection pooling and retry self.session = requests.Session() self.session.headers.update(self.headers) self.session.verify = verify_ssl - # Configure retries adapter = requests.adapters.HTTPAdapter( - max_retries=requests.urllib3.Retry( + max_retries=urllib3.Retry( total=max_retries, backoff_factor=retry_delay, status_forcelist=[500, 502, 503, 504], @@ -219,1117 +289,198 @@ def __init__( self.session.mount("http://", adapter) self.session.mount("https://", adapter) - # Add warning suppression if verify_ssl is False if not verify_ssl: urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - logger.info("✅ Client initialized successfully") - - def _make_request(self, method: str, url: str, **kwargs) -> Any: - """ - Make HTTP request with error handling and retry logic. - - Args: - method: HTTP method (GET, POST, etc.) 
- url: Full URL for the request - **kwargs: Additional arguments to pass to requests - - Returns: - Parsed JSON response data + # Namespaced sub-clients + self.crawl = _CrawlNamespace(self) + self.monitor = _MonitorNamespace(self) - Raises: - APIError: If the API returns an error response - ConnectionError: If unable to connect to the API + logger.info("Client initialized successfully") - Note: - In mock mode, this method returns deterministic responses without - making actual HTTP requests. - """ - # Short-circuit when mock mode is enabled - if getattr(self, "mock", False): - return self._mock_response(method, url, **kwargs) + def _make_request(self, method: str, url: str, **kwargs: Any) -> Any: + """Make HTTP request with error handling.""" try: - logger.info(f"🚀 Making {method} request to {url}") - logger.debug(f"🔍 Request parameters: {kwargs}") - + logger.debug(f"Making {method} request to {url}") response = self.session.request(method, url, timeout=self.timeout, **kwargs) - logger.debug(f"📥 Response status: {response.status_code}") - - result = handle_sync_response(response) - logger.info(f"✅ Request completed successfully: {method} {url}") - return result - + return handle_sync_response(response) except RequestException as e: - logger.error(f"❌ Request failed: {str(e)}") + logger.error(f"Request failed: {e}") if hasattr(e, "response") and e.response is not None: try: error_data = e.response.json() error_msg = error_data.get("error", str(e)) - logger.error(f"🔴 API Error: {error_msg}") raise APIError(error_msg, status_code=e.response.status_code) except ValueError: - logger.error("🔴 Could not parse error response") raise APIError( str(e), - status_code=( - e.response.status_code - if hasattr(e.response, "status_code") - else None - ), + status_code=getattr(e.response, "status_code", None), ) - logger.error(f"🔴 Connection Error: {str(e)}") - raise ConnectionError(f"Failed to connect to API: {str(e)}") + raise ConnectionError(f"Failed to connect to API: {e}") - 
def _mock_response(self, method: str, url: str, **kwargs) -> Any: - """Return a deterministic mock response without performing network I/O. - - Resolution order: - 1) If a custom mock_handler is provided, delegate to it - 2) If mock_responses contains a key for the request path, use it - 3) Fallback to built-in defaults per endpoint family - """ - logger.info(f"🧪 Mock mode active. Returning stub for {method} {url}") - - # 1) Custom handler - if self.mock_handler is not None: - try: - return self.mock_handler(method, url, kwargs) - except Exception as handler_error: - logger.warning(f"Custom mock_handler raised: {handler_error}. Falling back to defaults.") - - # 2) Path-based override - try: - parsed = urlparse(url) - path = parsed.path.rstrip("/") - except Exception: - path = url - - override = self.mock_responses.get(path) - if override is not None: - return override() if callable(override) else override - - # 3) Built-in defaults - def new_id(prefix: str) -> str: - return f"{prefix}-{_uuid.uuid4()}" - - upper_method = method.upper() - - # Credits endpoint - if path.endswith("/credits") and upper_method == "GET": - return {"remaining_credits": 1000, "total_credits_used": 0} - - # Health check endpoint - if path.endswith("/healthz") and upper_method == "GET": - return {"status": "healthy", "message": "Service is operational"} - - # Feedback acknowledge - if path.endswith("/feedback") and upper_method == "POST": - return {"status": "success"} - - # Create-like endpoints (POST) - if upper_method == "POST": - if path.endswith("/crawl"): - return {"crawl_id": new_id("mock-crawl")} - elif path.endswith("/scheduled-jobs"): - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" 
- } - elif "/pause" in path: - return { - "message": "Job paused successfully", - "job_id": new_id("mock-job"), - "is_active": False - } - elif "/resume" in path: - return { - "message": "Job resumed successfully", - "job_id": new_id("mock-job"), - "is_active": True, - "next_run_at": "2024-01-08T09:00:00Z" - } - elif "/trigger" in path: - return { - "execution_id": new_id("mock-task"), - "scheduled_job_id": new_id("mock-job"), - "triggered_at": "2024-01-01T00:00:00Z", - "message": f"Job triggered successfully. Task ID: {new_id('mock-task')}" - } - # All other POST endpoints return a request id - return {"request_id": new_id("mock-req")} - - # Status-like endpoints (GET) - if upper_method == "GET": - if "markdownify" in path: - return {"status": "completed", "content": "# Mock markdown\n\n..."} - if "smartscraper" in path: - return {"status": "completed", "result": [{"field": "value"}]} - if "searchscraper" in path: - return { - "status": "completed", - "results": [{"url": "https://example.com"}], - "markdown_content": "# Mock Markdown Content\n\nThis is mock markdown content for testing purposes.\n\n## Section 1\n\nSome content here.\n\n## Section 2\n\nMore content here.", - "reference_urls": ["https://example.com", "https://example2.com"] - } - if "crawl" in path: - return {"status": "completed", "pages": []} - if "agentic-scrapper" in path: - return {"status": "completed", "actions": []} - if "scheduled-jobs" in path: - if "/executions" in path: - return { - "executions": [ - { - "id": new_id("mock-exec"), - "scheduled_job_id": new_id("mock-job"), - "execution_id": new_id("mock-task"), - "status": "completed", - "started_at": "2024-01-01T00:00:00Z", - "completed_at": "2024-01-01T00:01:00Z", - "result": {"mock": "result"}, - "credits_used": 10 - } - ], - "total": 1, - "page": 1, - "page_size": 20 - } - elif path.endswith("/scheduled-jobs"): # List jobs endpoint - return { - "jobs": [ - { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": 
"Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" - } - ], - "total": 1, - "page": 1, - "page_size": 20 - } - else: # Single job endpoint - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" - } - - # Update operations (PATCH/PUT) - if upper_method in ["PATCH", "PUT"] and "scheduled-jobs" in path: - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Updated Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 10 * * 1", - "job_config": {"mock": "updated_config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T01:00:00Z", - "next_run_at": "2024-01-08T10:00:00Z" - } - - # Delete operations - if upper_method == "DELETE" and "scheduled-jobs" in path: - return {"message": "Scheduled job deleted successfully"} - - # Generic fallback - return {"status": "mock", "url": url, "method": method, "kwargs": kwargs} - - def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, wait_ms: Optional[int] = None, return_toon: bool = False): - """Send a markdownify request - - Args: - website_url: The URL to convert to markdown - headers: Optional HTTP headers - mock: Enable mock mode for testing - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - wait_ms: Number of milliseconds to wait before scraping the website - 
return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Starting markdownify request for {website_url}") - if headers: - logger.debug("🔧 Using custom headers") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if render_heavy_js: - logger.debug("⚡ Heavy JavaScript rendering enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth, wait_ms=wait_ms) - logger.debug("✅ Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/markdownify", json=request.model_dump() - ) - logger.info("✨ Markdownify request completed successfully") - return process_response_with_toon(result, return_toon) - - def get_markdownify(self, request_id: str, return_toon: bool = False): - """Get the result of a previous markdownify request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Fetching markdownify result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetMarkdownifyRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") - - result = self._make_request("GET", f"{API_BASE_URL}/markdownify/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) + # ------------------------------------------------------------------ + # Scrape + # ------------------------------------------------------------------ def scrape( self, - website_url: str, - render_heavy_js: bool = False, - branding: bool = False, - headers: Optional[dict[str, str]] = None, - mock:bool=False, - stealth:bool=False, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - 
"""Send a scrape request to get HTML content from a website + url: str, + format: str = "markdown", + fetch_config: Optional[FetchConfig] = None, + ) -> Dict[str, Any]: + """Scrape a page and return it in the specified format. Args: - website_url: The URL of the website to get HTML from - render_heavy_js: Whether to render heavy JavaScript (defaults to False) - branding: Whether to include branding in the response (defaults to False) - headers: Optional headers to send with the request - mock: Enable mock mode for testing - stealth: Enable stealth mode to avoid bot detection - wait_ms: Number of milliseconds to wait before scraping the website - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) + url: URL to scrape + format: Output format - 'markdown', 'html', 'screenshot', or 'branding' + fetch_config: Fetch configuration options """ - logger.info(f"🔍 Starting scrape request for {website_url}") - logger.debug(f"🔧 Render heavy JS: {render_heavy_js}") - logger.debug(f"🔧 Branding: {branding}") - if headers: - logger.debug("🔧 Using custom headers") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - + logger.info(f"Scraping {url} (format={format})") request = ScrapeRequest( - website_url=website_url, - render_heavy_js=render_heavy_js, - branding=branding, - headers=headers, - mock=mock, - stealth=stealth, - wait_ms=wait_ms, + url=url, + format=ScrapeFormat(format), + fetch_config=fetch_config, ) - logger.debug("✅ Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/scrape", json=request.model_dump() + return self._make_request( + "POST", f"{self.base_url}/scrape", json=request.model_dump() ) - logger.info("✨ Scrape request completed successfully") - return process_response_with_toon(result, return_toon) - def get_scrape(self, request_id: str, return_toon: bool = False): - """Get the result of a previous scrape request - - Args: - 
request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Fetching scrape result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetScrapeRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") + # ------------------------------------------------------------------ + # Extract (replaces SmartScraper) + # ------------------------------------------------------------------ - result = self._make_request("GET", f"{API_BASE_URL}/scrape/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) - - def sitemap( + def extract( self, - website_url: str, - mock: bool = False, - ) -> SitemapResponse: - """Extract all URLs from a website's sitemap. - - Automatically discovers sitemap from robots.txt or common sitemap locations. + url: str, + prompt: str, + output_schema: Optional[Any] = None, + fetch_config: Optional[FetchConfig] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Extract structured data from a page using AI. Args: - website_url: The URL of the website to extract sitemap from - mock: Whether to use mock mode for this request - - Returns: - SitemapResponse: Object containing list of URLs extracted from sitemap - - Raises: - ValueError: If website_url is invalid - APIError: If the API request fails - - Examples: - >>> client = Client(api_key="your-api-key") - >>> response = client.sitemap("https://example.com") - >>> print(f"Found {len(response.urls)} URLs") - >>> for url in response.urls[:5]: - ... 
print(url) + url: URL to extract data from + prompt: Natural language prompt describing what to extract + output_schema: JSON Schema dict or Pydantic BaseModel class for output structure + fetch_config: Fetch configuration options + llm_config: LLM configuration options """ - logger.info(f"🗺️ Starting sitemap extraction for {website_url}") + logger.info(f"Extracting from {url}") + + # Convert Pydantic model class to JSON schema + schema_dict = None + if output_schema is not None: + if isinstance(output_schema, type) and issubclass(output_schema, BaseModel): + schema_dict = output_schema.model_json_schema() + elif isinstance(output_schema, dict): + schema_dict = output_schema + else: + raise ValueError( + "output_schema must be a dict (JSON Schema) or a Pydantic BaseModel class" + ) - request = SitemapRequest( - website_url=website_url, - mock=mock + request = ExtractRequest( + url=url, + prompt=prompt, + output_schema=schema_dict, + fetch_config=fetch_config, + llm_config=llm_config, ) - logger.debug("✅ Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/sitemap", json=request.model_dump() + return self._make_request( + "POST", f"{self.base_url}/extract", json=request.model_dump() ) - logger.info(f"✨ Sitemap extraction completed successfully - found {len(result.get('urls', []))} URLs") - # Parse response into SitemapResponse model - return SitemapResponse(**result) + # ------------------------------------------------------------------ + # Search (replaces SearchScraper) + # ------------------------------------------------------------------ - def smartscraper( + def search( self, - user_prompt: str, - website_url: Optional[str] = None, - website_html: Optional[str] = None, - website_markdown: Optional[str] = None, - headers: Optional[dict[str, str]] = None, - cookies: Optional[Dict[str, str]] = None, - output_schema: Optional[BaseModel] = None, - number_of_scrolls: Optional[int] = None, - total_pages: Optional[int] = None, - mock: 
bool = False, - plain_text: bool = False, - render_heavy_js: bool = False, - stealth: bool = False, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - """ - Send a smartscraper request with optional pagination support and cookies. - - Supports three types of input (must provide exactly one): - - website_url: Scrape from a URL - - website_html: Process local HTML content - - website_markdown: Process local Markdown content - - Args: - user_prompt: Natural language prompt describing what to extract - website_url: URL to scrape (optional) - website_html: Raw HTML content to process (optional, max 2MB) - website_markdown: Markdown content to process (optional, max 2MB) - headers: Optional HTTP headers - cookies: Optional cookies for authentication - output_schema: Optional Pydantic model for structured output - number_of_scrolls: Number of times to scroll (0-100) - total_pages: Number of pages to scrape (1-10) - mock: Enable mock mode for testing - plain_text: Return plain text instead of structured data - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - wait_ms: Number of milliseconds to wait before scraping the website - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - - Returns: - Dictionary containing the scraping results, or TOON formatted string if return_toon=True - - Raises: - ValueError: If validation fails or invalid parameters provided - APIError: If the API request fails - """ - logger.info("🔍 Starting smartscraper request") - if website_url: - logger.debug(f"🌐 URL: {website_url}") - if website_html: - logger.debug("📄 Using provided HTML content") - if website_markdown: - logger.debug("📝 Using provided Markdown content") - if headers: - logger.debug("🔧 Using custom headers") - if cookies: - logger.debug("🍪 Using cookies for authentication/session management") - if number_of_scrolls is not None: - logger.debug(f"🔄 Number of scrolls: 
{number_of_scrolls}") - if total_pages is not None: - logger.debug(f"📄 Total pages to scrape: {total_pages}") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if render_heavy_js: - logger.debug("⚡ Heavy JavaScript rendering enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - logger.debug(f"📝 Prompt: {user_prompt}") - - request = SmartScraperRequest( - website_url=website_url, - website_html=website_html, - website_markdown=website_markdown, - headers=headers, - cookies=cookies, - user_prompt=user_prompt, - output_schema=output_schema, - number_of_scrolls=number_of_scrolls, - total_pages=total_pages, - mock=mock, - plain_text=plain_text, - render_heavy_js=render_heavy_js, - stealth=stealth, - wait_ms=wait_ms, - ) - logger.debug("✅ Request validation passed") + query: str, + num_results: int = 5, + output_schema: Optional[Any] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Search the web and extract structured results. - result = self._make_request( - "POST", f"{API_BASE_URL}/smartscraper", json=request.model_dump() - ) - logger.info("✨ Smartscraper request completed successfully") - return process_response_with_toon(result, return_toon) - - def get_smartscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous smartscraper request - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Fetching smartscraper result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetSmartScraperRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") - - result = self._make_request("GET", f"{API_BASE_URL}/smartscraper/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) - - def 
submit_feedback( - self, request_id: str, rating: int, feedback_text: Optional[str] = None - ): - """Submit feedback for a request""" - logger.info(f"📝 Submitting feedback for request {request_id}") - logger.debug(f"⭐ Rating: {rating}, Feedback: {feedback_text}") - - feedback = FeedbackRequest( - request_id=request_id, rating=rating, feedback_text=feedback_text - ) - logger.debug("✅ Feedback validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/feedback", json=feedback.model_dump() - ) - logger.info("✨ Feedback submitted successfully") - return result - - def get_credits(self): - """Get credits information""" - logger.info("💳 Fetching credits information") - - result = self._make_request( - "GET", - f"{API_BASE_URL}/credits", - ) - logger.info( - f"✨ Credits info retrieved: {result.get('remaining_credits')} " - f"credits remaining" - ) - return result - - def healthz(self): - """Check the health status of the service - - This endpoint is useful for monitoring and ensuring the service is operational. - It returns a JSON response indicating the service's health status. 
- - Returns: - dict: Health status information - - Example: - >>> client = Client.from_env() - >>> health = client.healthz() - >>> print(health) + query: The search query + num_results: Number of results (3-20, default 5) + output_schema: JSON Schema dict or Pydantic BaseModel class for output structure + llm_config: LLM configuration options """ - logger.info("🏥 Checking service health") - - result = self._make_request( - "GET", - f"{API_BASE_URL}/healthz", - ) - logger.info("✨ Health check completed successfully") - return result - - def searchscraper( - self, - user_prompt: str, - num_results: Optional[int] = 3, - headers: Optional[dict[str, str]] = None, - output_schema: Optional[BaseModel] = None, - extraction_mode: bool = True, - mock: bool = False, - stealth: bool = False, - location_geo_code: Optional[str] = None, - time_range: Optional[TimeRange] = None, - return_toon: bool = False, - ): - """Send a searchscraper request + logger.info(f"Searching: {query}") + + schema_dict = None + if output_schema is not None: + if isinstance(output_schema, type) and issubclass(output_schema, BaseModel): + schema_dict = output_schema.model_json_schema() + elif isinstance(output_schema, dict): + schema_dict = output_schema + else: + raise ValueError( + "output_schema must be a dict (JSON Schema) or a Pydantic BaseModel class" + ) - Args: - user_prompt: The search prompt string - num_results: Number of websites to scrape (3-20). Default is 3. - More websites provide better research depth but cost more - credits. Credit calculation: 30 base + 10 per additional - website beyond 3. - headers: Optional headers to send with the request - output_schema: Optional schema to structure the output - extraction_mode: Whether to use AI extraction (True) or markdown conversion (False). - AI extraction costs 10 credits per page, markdown conversion costs 2 credits per page. 
- mock: Enable mock mode for testing - stealth: Enable stealth mode to avoid bot detection - location_geo_code: Optional geo code of the location to search in (e.g., "us") - time_range: Optional time range filter for search results (e.g., TimeRange.PAST_WEEK) - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info("🔍 Starting searchscraper request") - logger.debug(f"📝 Prompt: {user_prompt}") - logger.debug(f"🌐 Number of results: {num_results}") - logger.debug(f"🤖 Extraction mode: {'AI extraction' if extraction_mode else 'Markdown conversion'}") - if headers: - logger.debug("🔧 Using custom headers") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if location_geo_code: - logger.debug(f"🌍 Location geo code: {location_geo_code}") - if time_range: - logger.debug(f"📅 Time range: {time_range.value}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - request = SearchScraperRequest( - user_prompt=user_prompt, + request = SearchRequest( + query=query, num_results=num_results, - headers=headers, - output_schema=output_schema, - extraction_mode=extraction_mode, - mock=mock, - stealth=stealth, - location_geo_code=location_geo_code, - time_range=time_range, - ) - logger.debug("✅ Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/searchscraper", json=request.model_dump() - ) - logger.info("✨ Searchscraper request completed successfully") - return process_response_with_toon(result, return_toon) - - def get_searchscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous searchscraper request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Fetching searchscraper result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - 
GetSearchScraperRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") - - result = self._make_request("GET", f"{API_BASE_URL}/searchscraper/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) - - def crawl( - self, - url: str, - prompt: Optional[str] = None, - data_schema: Optional[Dict[str, Any]] = None, - extraction_mode: bool = True, - cache_website: bool = True, - depth: int = 2, - breadth: Optional[int] = None, - max_pages: int = 2, - same_domain_only: bool = True, - batch_size: Optional[int] = None, - sitemap: bool = False, - headers: Optional[dict[str, str]] = None, - render_heavy_js: bool = False, - stealth: bool = False, - include_paths: Optional[list[str]] = None, - exclude_paths: Optional[list[str]] = None, - webhook_url: Optional[str] = None, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - """Send a crawl request with support for both AI extraction and - markdown conversion modes - - Args: - url: The starting URL to crawl - prompt: AI prompt for data extraction (required for AI extraction mode) - data_schema: Schema for structured output - extraction_mode: Whether to use AI extraction (True) or markdown (False) - cache_website: Whether to cache the website - depth: Maximum depth of link traversal - breadth: Maximum number of links to crawl per depth level. If None, unlimited (default). - Controls the 'width' of exploration at each depth. Useful for limiting crawl scope - on large sites. Note: max_pages always takes priority. Ignored when sitemap=True. 
- max_pages: Maximum number of pages to crawl - same_domain_only: Only crawl pages within the same domain - batch_size: Number of pages to process in batch - sitemap: Use sitemap for crawling - headers: Optional HTTP headers - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - include_paths: List of path patterns to include (e.g., ['/products/*', '/blog/**']) - Supports wildcards: * matches any characters, ** matches any path segments - exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*']) - Supports wildcards and takes precedence over include_paths - webhook_url: URL to receive webhook notifications when the crawl completes - wait_ms: Number of milliseconds to wait before scraping each page - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info("🔍 Starting crawl request") - logger.debug(f"🌐 URL: {url}") - logger.debug( - f"🤖 Extraction mode: {'AI' if extraction_mode else 'Markdown conversion'}" - ) - if extraction_mode: - logger.debug(f"📝 Prompt: {prompt}") - logger.debug(f"📊 Schema provided: {bool(data_schema)}") - else: - logger.debug( - "📄 Markdown conversion mode - no AI processing, 2 credits per page" - ) - logger.debug(f"💾 Cache website: {cache_website}") - logger.debug(f"🔍 Depth: {depth}") - if breadth is not None: - logger.debug(f"📏 Breadth: {breadth}") - logger.debug(f"📄 Max pages: {max_pages}") - logger.debug(f"🏠 Same domain only: {same_domain_only}") - logger.debug(f"🗺️ Use sitemap: {sitemap}") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if render_heavy_js: - logger.debug("⚡ Heavy JavaScript rendering enabled") - if batch_size is not None: - logger.debug(f"📦 Batch size: {batch_size}") - if include_paths: - logger.debug(f"✅ Include paths: {include_paths}") - if exclude_paths: - logger.debug(f"❌ Exclude paths: {exclude_paths}") - if webhook_url: - logger.debug(f"🔔 Webhook URL: {webhook_url}") - if wait_ms is not 
None: - logger.debug(f"⏱️ Wait ms: {wait_ms}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Build request data, excluding None values - request_data = { - "url": url, - "extraction_mode": extraction_mode, - "cache_website": cache_website, - "depth": depth, - "max_pages": max_pages, - "same_domain_only": same_domain_only, - "sitemap": sitemap, - "render_heavy_js": render_heavy_js, - "stealth": stealth, - } - - # Add optional parameters only if provided - if prompt is not None: - request_data["prompt"] = prompt - if data_schema is not None: - request_data["data_schema"] = data_schema - if breadth is not None: - request_data["breadth"] = breadth - if batch_size is not None: - request_data["batch_size"] = batch_size - if headers is not None: - request_data["headers"] = headers - if include_paths is not None: - request_data["include_paths"] = include_paths - if exclude_paths is not None: - request_data["exclude_paths"] = exclude_paths - if webhook_url is not None: - request_data["webhook_url"] = webhook_url - if wait_ms is not None: - request_data["wait_ms"] = wait_ms - - request = CrawlRequest(**request_data) - logger.debug("✅ Request validation passed") - - # Serialize the request, excluding None values - request_json = request.model_dump(exclude_none=True) - result = self._make_request("POST", f"{API_BASE_URL}/crawl", json=request_json) - logger.info("✨ Crawl request completed successfully") - return process_response_with_toon(result, return_toon) - - def get_crawl(self, crawl_id: str, return_toon: bool = False): - """Get the result of a previous crawl request - - Args: - crawl_id: The crawl ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Fetching crawl result for request {crawl_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetCrawlRequest(crawl_id=crawl_id) - logger.debug("✅ Request ID validation 
passed") - - result = self._make_request("GET", f"{API_BASE_URL}/crawl/{crawl_id}") - logger.info(f"✨ Successfully retrieved result for request {crawl_id}") - return process_response_with_toon(result, return_toon) - - def agenticscraper( - self, - url: str, - steps: list[str], - use_session: bool = True, - user_prompt: Optional[str] = None, - output_schema: Optional[Dict[str, Any]] = None, - ai_extraction: bool = False, - mock: bool=False, - stealth: bool=False, - return_toon: bool = False, - ): - """Send an agentic scraper request to perform automated actions on a webpage - - Args: - url: The URL to scrape - steps: List of steps to perform on the webpage - use_session: Whether to use session for the scraping (default: True) - user_prompt: Prompt for AI extraction (required when ai_extraction=True) - output_schema: Schema for structured data extraction (optional, used with ai_extraction=True) - ai_extraction: Whether to use AI for data extraction from the scraped content (default: False) - mock: Enable mock mode for testing - stealth: Enable stealth mode to avoid bot detection - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🤖 Starting agentic scraper request for {url}") - logger.debug(f"🔧 Use session: {use_session}") - logger.debug(f"📋 Steps: {steps}") - logger.debug(f"🧠 AI extraction: {ai_extraction}") - if ai_extraction: - logger.debug(f"💭 User prompt: {user_prompt}") - logger.debug(f"📋 Output schema provided: {output_schema is not None}") - if stealth: - logger.debug("🥷 Stealth mode enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - request = AgenticScraperRequest( - url=url, - steps=steps, - use_session=use_session, - user_prompt=user_prompt, - output_schema=output_schema, - ai_extraction=ai_extraction, - mock=mock, - stealth=stealth + output_schema=schema_dict, + llm_config=llm_config, ) - logger.debug("✅ Request validation passed") - - result = self._make_request( - 
"POST", f"{API_BASE_URL}/agentic-scrapper", json=request.model_dump() + return self._make_request( + "POST", f"{self.base_url}/search", json=request.model_dump() ) - logger.info("✨ Agentic scraper request completed successfully") - return process_response_with_toon(result, return_toon) - def get_agenticscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous agentic scraper request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"🔍 Fetching agentic scraper result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") + # ------------------------------------------------------------------ + # Credits + # ------------------------------------------------------------------ - # Validate input using Pydantic model - GetAgenticScraperRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") + def credits(self) -> Dict[str, Any]: + """Get remaining API credits.""" + logger.info("Fetching credits") + return self._make_request("GET", f"{self.base_url}/credits") - result = self._make_request("GET", f"{API_BASE_URL}/agentic-scrapper/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) + # ------------------------------------------------------------------ + # History + # ------------------------------------------------------------------ - def generate_schema( + def history( self, - user_prompt: str, - existing_schema: Optional[Dict[str, Any]] = None, - ): - """Generate a JSON schema from a user prompt - - Args: - user_prompt: The user's search query to be refined into a schema - existing_schema: Optional existing JSON schema to modify/extend - """ - logger.info("🔧 Starting schema generation request") - logger.debug(f"💭 User prompt: {user_prompt}") - if existing_schema: - logger.debug(f"📋 
Existing schema provided: {existing_schema is not None}") - - request = GenerateSchemaRequest( - user_prompt=user_prompt, - existing_schema=existing_schema, - ) - logger.debug("✅ Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/generate_schema", json=request.model_dump() - ) - logger.info("✨ Schema generation request completed successfully") - return result + endpoint: Optional[str] = None, + status: Optional[str] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, + ) -> Dict[str, Any]: + """Retrieve request history. - def get_schema_status(self, request_id: str): - """Get the status of a schema generation request - Args: - request_id: The request ID returned from generate_schema + endpoint: Filter by endpoint name (e.g. 'scrape', 'extract') + status: Filter by request status + limit: Maximum number of results (1-100) + offset: Number of results to skip """ - logger.info(f"🔍 Fetching schema generation status for request {request_id}") - - # Validate input using Pydantic model - GetSchemaStatusRequest(request_id=request_id) - logger.debug("✅ Request ID validation passed") - - result = self._make_request("GET", f"{API_BASE_URL}/generate_schema/{request_id}") - logger.info(f"✨ Successfully retrieved schema status for request {request_id}") - return result - - def create_scheduled_job( - self, - job_name: str, - service_type: str, - cron_expression: str, - job_config: dict, - is_active: bool = True, - ): - """Create a new scheduled job""" - logger.info(f"📅 Creating scheduled job: {job_name}") - - request = ScheduledJobCreate( - job_name=job_name, - service_type=service_type, - cron_expression=cron_expression, - job_config=job_config, - is_active=is_active, + logger.info("Fetching history") + filter_obj = HistoryFilter( + endpoint=endpoint, status=status, limit=limit, offset=offset ) - - result = self._make_request( - "POST", f"{API_BASE_URL}/scheduled-jobs", json=request.model_dump() + params = 
filter_obj.to_params() + return self._make_request( + "GET", f"{self.base_url}/history", params=params or None ) - logger.info("✨ Scheduled job created successfully") - return result - - def get_scheduled_jobs( - self, - page: int = 1, - page_size: int = 20, - service_type: Optional[str] = None, - is_active: Optional[bool] = None, - ): - """Get list of scheduled jobs with pagination""" - logger.info("📋 Fetching scheduled jobs") - - GetScheduledJobsRequest( - page=page, - page_size=page_size, - service_type=service_type, - is_active=is_active, - ) - - params = {"page": page, "page_size": page_size} - if service_type: - params["service_type"] = service_type - if is_active is not None: - params["is_active"] = is_active - - result = self._make_request("GET", f"{API_BASE_URL}/scheduled-jobs", params=params) - logger.info(f"✨ Successfully retrieved {len(result.get('jobs', []))} scheduled jobs") - return result - - def get_scheduled_job(self, job_id: str): - """Get details of a specific scheduled job""" - logger.info(f"🔍 Fetching scheduled job {job_id}") - - GetScheduledJobRequest(job_id=job_id) - - result = self._make_request("GET", f"{API_BASE_URL}/scheduled-jobs/{job_id}") - logger.info(f"✨ Successfully retrieved scheduled job {job_id}") - return result - def update_scheduled_job( - self, - job_id: str, - job_name: Optional[str] = None, - cron_expression: Optional[str] = None, - job_config: Optional[dict] = None, - is_active: Optional[bool] = None, - ): - """Update an existing scheduled job (partial update)""" - logger.info(f"📝 Updating scheduled job {job_id}") - - update_data = {} - if job_name is not None: - update_data["job_name"] = job_name - if cron_expression is not None: - update_data["cron_expression"] = cron_expression - if job_config is not None: - update_data["job_config"] = job_config - if is_active is not None: - update_data["is_active"] = is_active - - ScheduledJobUpdate(**update_data) - - result = self._make_request( - "PATCH", 
f"{API_BASE_URL}/scheduled-jobs/{job_id}", json=update_data - ) - logger.info(f"✨ Successfully updated scheduled job {job_id}") - return result - - def replace_scheduled_job( - self, - job_id: str, - job_name: str, - cron_expression: str, - job_config: dict, - is_active: bool = True, - ): - """Replace an existing scheduled job (full update)""" - logger.info(f"🔄 Replacing scheduled job {job_id}") - - request_data = { - "job_name": job_name, - "cron_expression": cron_expression, - "job_config": job_config, - "is_active": is_active, - } - - result = self._make_request( - "PUT", f"{API_BASE_URL}/scheduled-jobs/{job_id}", json=request_data - ) - logger.info(f"✨ Successfully replaced scheduled job {job_id}") - return result - - def delete_scheduled_job(self, job_id: str): - """Delete a scheduled job""" - logger.info(f"🗑️ Deleting scheduled job {job_id}") - - JobActionRequest(job_id=job_id) - - result = self._make_request("DELETE", f"{API_BASE_URL}/scheduled-jobs/{job_id}") - logger.info(f"✨ Successfully deleted scheduled job {job_id}") - return result - - def pause_scheduled_job(self, job_id: str): - """Pause a scheduled job""" - logger.info(f"⏸️ Pausing scheduled job {job_id}") - - JobActionRequest(job_id=job_id) - - result = self._make_request("POST", f"{API_BASE_URL}/scheduled-jobs/{job_id}/pause") - logger.info(f"✨ Successfully paused scheduled job {job_id}") - return result - - def resume_scheduled_job(self, job_id: str): - """Resume a paused scheduled job""" - logger.info(f"▶️ Resuming scheduled job {job_id}") - - JobActionRequest(job_id=job_id) - - result = self._make_request("POST", f"{API_BASE_URL}/scheduled-jobs/{job_id}/resume") - logger.info(f"✨ Successfully resumed scheduled job {job_id}") - return result - - def trigger_scheduled_job(self, job_id: str): - """Manually trigger a scheduled job""" - logger.info(f"🚀 Manually triggering scheduled job {job_id}") - - TriggerJobRequest(job_id=job_id) - - result = self._make_request("POST", 
f"{API_BASE_URL}/scheduled-jobs/{job_id}/trigger") - logger.info(f"✨ Successfully triggered scheduled job {job_id}") - return result - - def get_job_executions( - self, - job_id: str, - page: int = 1, - page_size: int = 20, - status: Optional[str] = None, - ): - """Get execution history for a scheduled job""" - logger.info(f"📊 Fetching execution history for job {job_id}") - - GetJobExecutionsRequest( - job_id=job_id, - page=page, - page_size=page_size, - status=status, - ) - - params = {"page": page, "page_size": page_size} - if status: - params["status"] = status - - result = self._make_request( - "GET", f"{API_BASE_URL}/scheduled-jobs/{job_id}/executions", params=params - ) - logger.info(f"✨ Successfully retrieved execution history for job {job_id}") - return result + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ - def close(self): - """Close the session to free up resources""" - logger.info("🔒 Closing Client session") + def close(self) -> None: + """Close the HTTP session.""" + logger.info("Closing Client session") self.session.close() - logger.debug("✅ Session closed successfully") - def __enter__(self): + def __enter__(self) -> "Client": return self - def __exit__(self, exc_type, exc_val, exc_tb): + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: self.close() diff --git a/scrapegraph-py/scrapegraph_py/config.py b/scrapegraph-py/scrapegraph_py/config.py index e7ca178..a9b7d08 100644 --- a/scrapegraph-py/scrapegraph_py/config.py +++ b/scrapegraph-py/scrapegraph_py/config.py @@ -1,15 +1,11 @@ """ -Configuration and constants for the ScrapeGraphAI SDK. - -This module contains API configuration settings including the base URL -and default headers used for all API requests. 
- -Attributes: - API_BASE_URL (str): Base URL for the ScrapeGraphAI API endpoints - DEFAULT_HEADERS (dict): Default HTTP headers for API requests +Configuration and constants for the ScrapeGraphAI SDK v2. """ -API_BASE_URL = "https://api.scrapegraphai.com/v1" + +VERSION = "2.0.0" +API_BASE_URL = "https://api.scrapegraphai.com/api/v2" DEFAULT_HEADERS = { "accept": "application/json", "Content-Type": "application/json", + "X-SDK-Version": f"python@{VERSION}", } diff --git a/scrapegraph-py/scrapegraph_py/models/__init__.py b/scrapegraph-py/scrapegraph_py/models/__init__.py index 1f374b8..49fa62b 100644 --- a/scrapegraph-py/scrapegraph_py/models/__init__.py +++ b/scrapegraph-py/scrapegraph_py/models/__init__.py @@ -1,57 +1,32 @@ """ -Pydantic models for all ScrapeGraphAI API endpoints. - -This module provides request and response models for validating and -structuring data for all API operations. All models use Pydantic for -data validation and serialization. - -Available Models: - - AgenticScraperRequest, GetAgenticScraperRequest: Agentic scraper operations - - CrawlRequest, GetCrawlRequest: Website crawling operations - - FeedbackRequest: User feedback submission - - ScrapeRequest, GetScrapeRequest: Basic HTML scraping - - MarkdownifyRequest, GetMarkdownifyRequest: Markdown conversion - - SearchScraperRequest, GetSearchScraperRequest: Web research - - SmartScraperRequest, GetSmartScraperRequest: AI-powered scraping - - GenerateSchemaRequest, GetSchemaStatusRequest: Schema generation - - ScheduledJob models: Job scheduling and management - -Example: - >>> from scrapegraph_py.models import SmartScraperRequest - >>> request = SmartScraperRequest( - ... website_url="https://example.com", - ... user_prompt="Extract product info" - ... ) +Pydantic models for the ScrapeGraphAI v2 API. 
""" -from .agenticscraper import AgenticScraperRequest, GetAgenticScraperRequest -from .crawl import CrawlRequest, GetCrawlRequest -from .feedback import FeedbackRequest -from .scrape import GetScrapeRequest, ScrapeRequest -from .markdownify import GetMarkdownifyRequest, MarkdownifyRequest -from .searchscraper import GetSearchScraperRequest, SearchScraperRequest, TimeRange -from .sitemap import SitemapRequest, SitemapResponse -from .smartscraper import GetSmartScraperRequest, SmartScraperRequest -from .schema import GenerateSchemaRequest, GetSchemaStatusRequest, SchemaGenerationResponse +from .shared import FetchConfig, LlmConfig +from .scrape import ScrapeFormat, ScrapeRequest, GetScrapeRequest +from .extract import ExtractRequest +from .search import SearchRequest +from .crawl import CrawlFormat, CrawlRequest +from .monitor import MonitorCreateRequest +from .history import HistoryFilter __all__ = [ - "AgenticScraperRequest", - "GetAgenticScraperRequest", - "CrawlRequest", - "GetCrawlRequest", - "FeedbackRequest", - "GetScrapeRequest", + # Shared + "FetchConfig", + "LlmConfig", + # Scrape + "ScrapeFormat", "ScrapeRequest", - "GetMarkdownifyRequest", - "MarkdownifyRequest", - "GetSearchScraperRequest", - "SearchScraperRequest", - "TimeRange", - "SitemapRequest", - "SitemapResponse", - "GetSmartScraperRequest", - "SmartScraperRequest", - "GenerateSchemaRequest", - "GetSchemaStatusRequest", - "SchemaGenerationResponse", + "GetScrapeRequest", + # Extract + "ExtractRequest", + # Search + "SearchRequest", + # Crawl + "CrawlFormat", + "CrawlRequest", + # Monitor + "MonitorCreateRequest", + # History + "HistoryFilter", ] diff --git a/scrapegraph-py/scrapegraph_py/models/agenticscraper.py b/scrapegraph-py/scrapegraph_py/models/agenticscraper.py deleted file mode 100644 index 93b6234..0000000 --- a/scrapegraph-py/scrapegraph_py/models/agenticscraper.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Pydantic models for the Agentic Scraper API endpoint. 
- -This module defines request and response models for the Agentic Scraper endpoint, -which performs automated browser interactions and optional AI data extraction. - -The Agentic Scraper can: -- Execute a sequence of browser actions (click, type, scroll, etc.) -- Handle authentication flows and form submissions -- Optionally extract structured data using AI after interactions -- Maintain browser sessions across multiple steps -""" - -from typing import Any, Dict, List, Optional -from uuid import UUID - -from pydantic import BaseModel, Field, model_validator - - -class AgenticScraperRequest(BaseModel): - """ - Request model for the Agentic Scraper endpoint. - - This model validates and structures requests for automated browser - interactions with optional AI extraction. - - Attributes: - url: The starting URL for the scraping session - use_session: Whether to maintain browser session across steps - steps: List of actions to perform (e.g., "Type email@example.com in email input") - user_prompt: Optional prompt for AI extraction (required if ai_extraction=True) - output_schema: Optional schema for structured data extraction - ai_extraction: Whether to use AI for data extraction after interactions - headers: Optional HTTP headers - mock: Whether to use mock mode for testing - render_heavy_js: Whether to render heavy JavaScript - - Example: - >>> request = AgenticScraperRequest( - ... url="https://dashboard.example.com", - ... steps=[ - ... "Type user@example.com in email input", - ... "Type password123 in password input", - ... "Click login button" - ... ], - ... ai_extraction=True, - ... user_prompt="Extract user dashboard information" - ... 
) - """ - url: str = Field( - ..., - example="https://dashboard.scrapegraphai.com/", - description="The URL to scrape" - ) - use_session: bool = Field( - default=True, - description="Whether to use session for the scraping" - ) - steps: List[str] = Field( - ..., - example=[ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login" - ], - description="List of steps to perform on the webpage" - ) - user_prompt: Optional[str] = Field( - default=None, - example="Extract user information and available dashboard sections", - description="Prompt for AI extraction (only used when ai_extraction=True)" - ) - output_schema: Optional[Dict[str, Any]] = Field( - default=None, - example={ - "user_info": { - "type": "object", - "properties": { - "username": {"type": "string"}, - "email": {"type": "string"}, - "dashboard_sections": {"type": "array", "items": {"type": "string"}} - } - } - }, - description="Schema for structured data extraction (only used when ai_extraction=True)" - ) - ai_extraction: bool = Field( - default=False, - description="Whether to use AI for data extraction from the scraped content" - ) - headers: Optional[dict[str, str]] = Field( - None, - example={ - "User-Agent": "scrapegraph-py", - "Cookie": "cookie1=value1; cookie2=value2", - }, - description="Optional headers to send with the request, including cookies " - "and user agent", - ) - mock: bool = Field(default=False, description="Whether to use mock mode for the request") - render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") - stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") - - @model_validator(mode="after") - def validate_url(self) -> "AgenticScraperRequest": - if not self.url.strip(): - raise ValueError("URL cannot be empty") - if not ( - self.url.startswith("http://") - or self.url.startswith("https://") - ): - raise ValueError("Invalid URL - must 
start with http:// or https://") - return self - - @model_validator(mode="after") - def validate_steps(self) -> "AgenticScraperRequest": - if not self.steps: - raise ValueError("Steps cannot be empty") - if any(not step.strip() for step in self.steps): - raise ValueError("All steps must contain valid instructions") - return self - - @model_validator(mode="after") - def validate_ai_extraction(self) -> "AgenticScraperRequest": - if self.ai_extraction: - if not self.user_prompt or not self.user_prompt.strip(): - raise ValueError("user_prompt is required when ai_extraction=True") - return self - - def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization - kwargs.setdefault("exclude_none", True) - return super().model_dump(*args, **kwargs) - - -class GetAgenticScraperRequest(BaseModel): - """Request model for get_agenticscraper endpoint""" - - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_request_id(self) -> "GetAgenticScraperRequest": - try: - # Validate the request_id is a valid UUID - UUID(self.request_id) - except ValueError: - raise ValueError("request_id must be a valid UUID") - return self diff --git a/scrapegraph-py/scrapegraph_py/models/crawl.py b/scrapegraph-py/scrapegraph_py/models/crawl.py index dd6cca9..1b7458c 100644 --- a/scrapegraph-py/scrapegraph_py/models/crawl.py +++ b/scrapegraph-py/scrapegraph_py/models/crawl.py @@ -1,219 +1,61 @@ -# Models for crawl endpoint +""" +Pydantic models for the v2 Crawl endpoints. 
-from typing import Any, Dict, Optional -from uuid import UUID +POST /v2/crawl - Start a crawl job +GET /v2/crawl/:id - Get crawl status/results +POST /v2/crawl/:id/stop - Stop a running crawl +POST /v2/crawl/:id/resume - Resume a stopped crawl +""" + +from enum import Enum +from typing import Any, Dict, List, Optional from pydantic import BaseModel, Field, conint, model_validator +from .shared import FetchConfig -class CrawlRequest(BaseModel): - """ - Request model for the crawl endpoint. - The crawl endpoint supports two modes: - 1. AI Extraction Mode (extraction_mode=True): Uses AI to extract structured data - 2. Markdown Conversion Mode (extraction_mode=False): Converts pages to markdown (80% cheaper) +class CrawlFormat(str, Enum): + """Output format for crawled pages.""" - Sitemap Support: - - When sitemap=True, the crawler uses sitemap.xml for better page discovery - - Recommended for structured websites (e-commerce, news sites, blogs) - - Provides more comprehensive crawling coverage - - Works with both AI extraction and markdown conversion modes + MARKDOWN = "markdown" + HTML = "html" - Path Filtering: - - include_paths: Specify which paths to crawl (e.g., ['/products/*', '/blog/**']) - - exclude_paths: Specify which paths to skip (e.g., ['/admin/*', '/api/*']) - - Supports wildcards: * (any characters), ** (any path segments) - - exclude_paths takes precedence over include_paths - """ - url: str = Field( - ..., - example="https://scrapegraphai.com/", - description="The starting URL for the crawl", - ) - extraction_mode: bool = Field( - default=True, - description="True for AI extraction mode, False for markdown conversion " - "mode (no AI/LLM processing)", - ) - prompt: Optional[str] = Field( - default=None, - example="What does the company do? 
and I need text content from there " - "privacy and terms", - description="The prompt to guide the crawl and extraction (required when " - "extraction_mode=True)", - ) - data_schema: Optional[Dict[str, Any]] = Field( - default=None, - description="JSON schema defining the structure of the extracted data " - "(required when extraction_mode=True)", - ) - cache_website: bool = Field( - default=True, description="Whether to cache the website content" - ) + +class CrawlRequest(BaseModel): + """Request model for POST /v2/crawl.""" + + url: str = Field(..., description="The starting URL for the crawl") depth: conint(ge=1, le=10) = Field( - default=2, description="Maximum depth of the crawl (1-10)" - ) - breadth: Optional[conint(ge=1)] = Field( - default=None, - description="Maximum number of links to crawl per depth level. " - "If None, unlimited (default). Controls the 'width' of exploration at each depth. " - "Useful for limiting crawl scope on large sites. Note: max_pages always takes priority - " - "the total crawled pages will never exceed max_pages regardless of breadth setting. " - "Ignored when sitemap=True (sitemap mode uses sitemap URLs directly instead of link discovery).", + default=2, description="Maximum crawl depth (1-10)" ) max_pages: conint(ge=1, le=100) = Field( - default=2, description="Maximum number of pages to crawl (1-100)" - ) - same_domain_only: bool = Field( - default=True, description="Whether to only crawl pages from the same domain" - ) - batch_size: Optional[conint(ge=1, le=10)] = Field( - default=None, description="Batch size for processing pages (1-10)" - ) - sitemap: bool = Field( - default=False, - description="Whether to use sitemap.xml for better page discovery and more comprehensive crawling. " - "When enabled, the crawler will use the website's sitemap.xml to discover pages more efficiently, " - "providing better coverage for structured websites like e-commerce sites, news portals, and content-heavy websites." 
+ default=10, description="Maximum number of pages to crawl (1-100)" ) - headers: Optional[dict[str, str]] = Field( - None, - example={ - "User-Agent": "scrapegraph-py", - "Cookie": "cookie1=value1; cookie2=value2", - }, - description="Optional headers to send with the request, including cookies " - "and user agent", + format: CrawlFormat = Field( + default=CrawlFormat.MARKDOWN, + description="Output format: markdown or html", ) - render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") - stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") - include_paths: Optional[list[str]] = Field( + include_patterns: Optional[List[str]] = Field( default=None, - description="List of path patterns to include (e.g., ['/products/*', '/blog/**']). " - "Supports wildcards: * matches any characters, ** matches any path segments. " - "If empty, all paths are included.", - example=["/products/*", "/blog/**"] + description="URL patterns to include (e.g. ['/products/*', '/blog/**'])", ) - exclude_paths: Optional[list[str]] = Field( + exclude_patterns: Optional[List[str]] = Field( default=None, - description="List of path patterns to exclude (e.g., ['/admin/*', '/api/*']). " - "Supports wildcards: * matches any characters, ** matches any path segments. " - "Takes precedence over include_paths.", - example=["/admin/*", "/api/**"] + description="URL patterns to exclude (e.g. ['/admin/*', '/api/*'])", ) - webhook_url: Optional[str] = Field( - default=None, - description="URL to receive webhook notifications when the crawl job completes. " - "The webhook will receive a POST request with the crawl results.", - example="https://example.com/webhook" - ) - wait_ms: Optional[int] = Field( - default=None, - description="Number of milliseconds to wait before scraping each page. 
" - "Useful for pages with heavy JavaScript rendering that need extra time to load.", + fetch_config: Optional[FetchConfig] = Field( + default=None, description="Fetch configuration options" ) @model_validator(mode="after") def validate_url(self) -> "CrawlRequest": - if not self.url.strip(): + if not self.url or not self.url.strip(): raise ValueError("URL cannot be empty") if not (self.url.startswith("http://") or self.url.startswith("https://")): - raise ValueError("Invalid URL - must start with http:// or https://") - return self - - @model_validator(mode="after") - def validate_extraction_mode_requirements(self) -> "CrawlRequest": - """Validate requirements based on extraction mode""" - if self.extraction_mode: - # AI extraction mode - require prompt and data_schema - if not self.prompt: - raise ValueError("Prompt is required when extraction_mode=True") - if not self.prompt.strip(): - raise ValueError("Prompt cannot be empty") - if not any(c.isalnum() for c in self.prompt): - raise ValueError("Prompt must contain valid content") - - if not self.data_schema: - raise ValueError("Data schema is required when extraction_mode=True") - if not isinstance(self.data_schema, dict): - raise ValueError("Data schema must be a dictionary") - if not self.data_schema: - raise ValueError("Data schema cannot be empty") - else: - # Markdown conversion mode - prompt and data_schema should be None - if self.prompt is not None: - raise ValueError( - "Prompt should not be provided when extraction_mode=False " - "(markdown mode)" - ) - if self.data_schema is not None: - raise ValueError( - "Data schema should not be provided when extraction_mode=False " - "(markdown mode)" - ) - - return self - - @model_validator(mode="after") - def validate_batch_size(self) -> "CrawlRequest": - if self.batch_size is not None and ( - self.batch_size < 1 or self.batch_size > 10 - ): - raise ValueError("Batch size must be between 1 and 10") - return self - - @model_validator(mode="after") - def 
validate_sitemap_usage(self) -> "CrawlRequest": - """Validate sitemap usage and provide recommendations""" - if self.sitemap: - # Log recommendation for sitemap usage - if self.max_pages < 5: - # This is just a recommendation, not an error - pass # Could add logging here if needed - return self - - @model_validator(mode="after") - def validate_path_patterns(self) -> "CrawlRequest": - """Validate path patterns start with '/'""" - if self.include_paths: - for path in self.include_paths: - if not path.startswith("/"): - raise ValueError(f"Include path must start with '/': {path}") - - if self.exclude_paths: - for path in self.exclude_paths: - if not path.startswith("/"): - raise ValueError(f"Exclude path must start with '/': {path}") - - return self - - @model_validator(mode="after") - def validate_webhook_url(self) -> "CrawlRequest": - """Validate webhook URL format if provided""" - if self.webhook_url is not None: - if not self.webhook_url.strip(): - raise ValueError("Webhook URL cannot be empty") - if not ( - self.webhook_url.startswith("http://") - or self.webhook_url.startswith("https://") - ): - raise ValueError( - "Invalid webhook URL - must start with http:// or https://" - ) + raise ValueError("URL must start with http:// or https://") return self - -class GetCrawlRequest(BaseModel): - """Request model for get_crawl endpoint""" - - crawl_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_crawl_id(self) -> "GetCrawlRequest": - try: - # Validate the crawl_id is a valid UUID - UUID(self.crawl_id) - except ValueError: - raise ValueError("crawl_id must be a valid UUID") - return self + def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: + kwargs.setdefault("exclude_none", True) + return super().model_dump(*args, **kwargs) diff --git a/scrapegraph-py/scrapegraph_py/models/extract.py b/scrapegraph-py/scrapegraph_py/models/extract.py new file mode 100644 index 0000000..3b3381b --- 
/dev/null +++ b/scrapegraph-py/scrapegraph_py/models/extract.py @@ -0,0 +1,47 @@ +""" +Pydantic models for the v2 Extract endpoint. + +POST /v2/extract - AI-powered data extraction (replaces SmartScraper). +""" + +from typing import Any, Dict, Optional + +from pydantic import BaseModel, Field, model_validator + +from .shared import FetchConfig, LlmConfig + + +class ExtractRequest(BaseModel): + """Request model for POST /v2/extract.""" + + url: str = Field(..., description="URL of the page to extract data from") + prompt: str = Field(..., description="Natural language prompt describing what to extract") + output_schema: Optional[Dict[str, Any]] = Field( + default=None, + description="JSON Schema defining the structure of the extracted data", + ) + fetch_config: Optional[FetchConfig] = Field( + default=None, description="Fetch configuration options" + ) + llm_config: Optional[LlmConfig] = Field( + default=None, description="LLM configuration options" + ) + + @model_validator(mode="after") + def validate_fields(self) -> "ExtractRequest": + if not self.url or not self.url.strip(): + raise ValueError("URL cannot be empty") + if not (self.url.startswith("http://") or self.url.startswith("https://")): + raise ValueError("URL must start with http:// or https://") + if not self.prompt or not self.prompt.strip(): + raise ValueError("Prompt cannot be empty") + return self + + def to_api_payload(self) -> Dict[str, Any]: + """Convert to API payload, handling Pydantic BaseModel output_schema.""" + data = self.model_dump(exclude_none=True) + return data + + def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: + kwargs.setdefault("exclude_none", True) + return super().model_dump(*args, **kwargs) diff --git a/scrapegraph-py/scrapegraph_py/models/feedback.py b/scrapegraph-py/scrapegraph_py/models/feedback.py deleted file mode 100644 index 43c41ec..0000000 --- a/scrapegraph-py/scrapegraph_py/models/feedback.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Pydantic models for the 
Feedback API endpoint. - -This module defines request models for submitting user feedback about -API requests, helping improve the service quality. -""" - -from typing import Optional -from uuid import UUID - -from pydantic import BaseModel, Field, model_validator - - -class FeedbackRequest(BaseModel): - """Request model for feedback endpoint""" - - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - rating: int = Field(..., ge=1, le=5, example=5) - feedback_text: Optional[str] = Field(None, example="Great results!") - - @model_validator(mode="after") - def validate_request_id(self) -> "FeedbackRequest": - try: - UUID(self.request_id) - except ValueError: - raise ValueError("request_id must be a valid UUID") - return self - - def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization - kwargs.setdefault("exclude_none", True) - return super().model_dump(*args, **kwargs) diff --git a/scrapegraph-py/scrapegraph_py/models/history.py b/scrapegraph-py/scrapegraph_py/models/history.py new file mode 100644 index 0000000..164b25f --- /dev/null +++ b/scrapegraph-py/scrapegraph_py/models/history.py @@ -0,0 +1,30 @@ +""" +Pydantic models for the v2 History endpoint. + +GET /v2/history - Retrieve request history with optional filters. +""" + +from typing import Any, Dict, Optional + +from pydantic import BaseModel, Field + + +class HistoryFilter(BaseModel): + """Query parameters for GET /v2/history.""" + + endpoint: Optional[str] = Field( + default=None, description="Filter by endpoint name (e.g. 
'scrape', 'extract')" + ) + status: Optional[str] = Field( + default=None, description="Filter by request status" + ) + limit: Optional[int] = Field( + default=None, ge=1, le=100, description="Maximum number of results (1-100)" + ) + offset: Optional[int] = Field( + default=None, ge=0, description="Number of results to skip" + ) + + def to_params(self) -> Dict[str, Any]: + """Convert to query parameter dict, excluding None values.""" + return {k: v for k, v in self.model_dump().items() if v is not None} diff --git a/scrapegraph-py/scrapegraph_py/models/markdownify.py b/scrapegraph-py/scrapegraph_py/models/markdownify.py deleted file mode 100644 index 0b95903..0000000 --- a/scrapegraph-py/scrapegraph_py/models/markdownify.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -Pydantic models for the Markdownify API endpoint. - -This module defines request and response models for the Markdownify endpoint, -which converts web pages into clean markdown format. - -The Markdownify endpoint is useful for: -- Converting HTML to markdown for easier processing -- Extracting clean text content from websites -- Preparing content for LLM consumption -""" - -from typing import Optional -from uuid import UUID - -from pydantic import BaseModel, Field, model_validator - - -class MarkdownifyRequest(BaseModel): - """ - Request model for the Markdownify endpoint. - - This model validates and structures requests for converting web pages - to markdown format. 
- - Attributes: - website_url: URL of the website to convert to markdown - headers: Optional HTTP headers including cookies - mock: Whether to use mock mode for testing - render_heavy_js: Whether to render heavy JavaScript on the page - stealth: Enable stealth mode to avoid bot detection - - Example: - >>> request = MarkdownifyRequest(website_url="https://example.com") - """ - website_url: str = Field(..., example="https://scrapegraphai.com/") - headers: Optional[dict[str, str]] = Field( - None, - example={ - "User-Agent": "scrapegraph-py", - "Cookie": "cookie1=value1; cookie2=value2", - }, - description="Optional headers to send with the request, including cookies " - "and user agent", - ) - mock: bool = Field(default=False, description="Whether to use mock mode for the request") - render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") - stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") - wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website") - - @model_validator(mode="after") - def validate_url(self) -> "MarkdownifyRequest": - if self.website_url is None or not self.website_url.strip(): - raise ValueError("Website URL cannot be empty") - if not ( - self.website_url.startswith("http://") - or self.website_url.startswith("https://") - ): - raise ValueError("Invalid URL") - return self - - def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization - kwargs.setdefault("exclude_none", True) - return super().model_dump(*args, **kwargs) - - -class GetMarkdownifyRequest(BaseModel): - """Request model for get_markdownify endpoint""" - - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_request_id(self) -> "GetMarkdownifyRequest": - try: - # Validate the request_id is a valid UUID 
- UUID(self.request_id) - except ValueError: - raise ValueError("request_id must be a valid UUID") - return self diff --git a/scrapegraph-py/scrapegraph_py/models/monitor.py b/scrapegraph-py/scrapegraph_py/models/monitor.py new file mode 100644 index 0000000..809c7d6 --- /dev/null +++ b/scrapegraph-py/scrapegraph_py/models/monitor.py @@ -0,0 +1,54 @@ +""" +Pydantic models for the v2 Monitor endpoints. + +POST /v2/monitor - Create a monitor +GET /v2/monitor - List monitors +GET /v2/monitor/:id - Get a monitor +POST /v2/monitor/:id/pause - Pause a monitor +POST /v2/monitor/:id/resume - Resume a monitor +DELETE /v2/monitor/:id - Delete a monitor +""" + +from typing import Any, Dict, Optional + +from pydantic import BaseModel, Field, model_validator + +from .shared import FetchConfig, LlmConfig + + +class MonitorCreateRequest(BaseModel): + """Request model for POST /v2/monitor.""" + + name: str = Field(..., description="Name of the monitor") + url: str = Field(..., description="URL to monitor") + prompt: str = Field(..., description="Prompt for AI extraction") + cron: str = Field(..., description="Cron expression for scheduling (5 fields)") + output_schema: Optional[Dict[str, Any]] = Field( + default=None, + description="JSON Schema defining the structure of extracted data", + ) + fetch_config: Optional[FetchConfig] = Field( + default=None, description="Fetch configuration options" + ) + llm_config: Optional[LlmConfig] = Field( + default=None, description="LLM configuration options" + ) + + @model_validator(mode="after") + def validate_fields(self) -> "MonitorCreateRequest": + if not self.name or not self.name.strip(): + raise ValueError("Name cannot be empty") + if not self.url or not self.url.strip(): + raise ValueError("URL cannot be empty") + if not (self.url.startswith("http://") or self.url.startswith("https://")): + raise ValueError("URL must start with http:// or https://") + if not self.prompt or not self.prompt.strip(): + raise ValueError("Prompt cannot be 
empty") + parts = self.cron.strip().split() + if len(parts) != 5: + raise ValueError("Cron expression must have exactly 5 fields") + return self + + def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: + kwargs.setdefault("exclude_none", True) + return super().model_dump(*args, **kwargs) diff --git a/scrapegraph-py/scrapegraph_py/models/scheduled_jobs.py b/scrapegraph-py/scrapegraph_py/models/scheduled_jobs.py deleted file mode 100644 index 46e83d6..0000000 --- a/scrapegraph-py/scrapegraph_py/models/scheduled_jobs.py +++ /dev/null @@ -1,151 +0,0 @@ -""" -Pydantic models for the Scheduled Jobs API endpoints. - -This module defines request and response models for managing scheduled jobs, -which allow you to automate recurring scraping tasks using cron expressions. - -Scheduled Jobs support: -- Creating recurring scraping jobs -- Managing job lifecycle (pause, resume, delete) -- Manually triggering jobs on demand -- Viewing execution history -- Filtering and pagination -""" - -from typing import Any, Dict, Optional -from enum import Enum -from pydantic import BaseModel, Field, model_validator - - -class ServiceType(str, Enum): - """ - Enum defining available service types for scheduled jobs. 
- - Available services: - SMART_SCRAPER: AI-powered web scraping - SEARCH_SCRAPER: Web research across multiple sources - AGENTIC_SCRAPER: Automated browser interactions - """ - SMART_SCRAPER = "smartscraper" - SEARCH_SCRAPER = "searchscraper" - AGENTIC_SCRAPER = "agenticscraper" - - -class ScheduledJobCreate(BaseModel): - """Model for creating a new scheduled job""" - job_name: str = Field(..., min_length=1, description="Name of the scheduled job") - service_type: str = Field(..., description="Type of service (smartscraper, searchscraper, etc.)") - cron_expression: str = Field(..., description="Cron expression for scheduling") - job_config: Dict[str, Any] = Field( - ..., - example={ - "website_url": "https://example.com", - "user_prompt": "Extract company information", - "headers": { - "User-Agent": "scrapegraph-py", - "Cookie": "session=abc123" - } - }, - description="Configuration for the job" - ) - is_active: bool = Field(default=True, description="Whether the job is active") - - @model_validator(mode="after") - def validate_cron_expression(self) -> "ScheduledJobCreate": - parts = self.cron_expression.strip().split() - if len(parts) != 5: - raise ValueError("Cron expression must have exactly 5 fields") - return self - - -class ScheduledJobUpdate(BaseModel): - """Model for updating a scheduled job (partial update)""" - job_name: Optional[str] = Field(None, description="Name of the scheduled job") - cron_expression: Optional[str] = Field(None, description="Cron expression for scheduling") - job_config: Optional[Dict[str, Any]] = Field(None, description="Configuration for the job") - is_active: Optional[bool] = Field(None, description="Whether the job is active") - - -class GetScheduledJobsRequest(BaseModel): - """Model for getting list of scheduled jobs""" - page: int = Field(default=1, ge=1, description="Page number") - page_size: int = Field(default=20, ge=1, le=100, description="Number of jobs per page") - service_type: Optional[str] = Field(None, 
description="Filter by service type") - is_active: Optional[bool] = Field(None, description="Filter by active status") - - -class GetScheduledJobRequest(BaseModel): - """Model for getting a specific scheduled job""" - job_id: str = Field(..., description="ID of the scheduled job") - - -class JobActionRequest(BaseModel): - """Model for job actions (pause, resume, delete)""" - job_id: str = Field(..., description="ID of the scheduled job") - - -class TriggerJobRequest(BaseModel): - """Model for manually triggering a job""" - job_id: str = Field(..., description="ID of the scheduled job") - - -class GetJobExecutionsRequest(BaseModel): - """Model for getting job execution history""" - job_id: str = Field(..., description="ID of the scheduled job") - page: int = Field(default=1, ge=1, description="Page number") - page_size: int = Field(default=20, ge=1, le=100, description="Number of executions per page") - status: Optional[str] = Field(None, description="Filter by execution status") - - -class JobActionResponse(BaseModel): - """Response model for job actions""" - success: bool = Field(..., description="Whether the action was successful") - message: str = Field(..., description="Response message") - job_id: str = Field(..., description="ID of the scheduled job") - - -class JobExecutionListResponse(BaseModel): - """Response model for job execution list""" - executions: list = Field(..., description="List of job executions") - total_count: int = Field(..., description="Total number of executions") - page: int = Field(..., description="Current page number") - page_size: int = Field(..., description="Number of executions per page") - - -class JobTriggerResponse(BaseModel): - """Response model for job trigger""" - success: bool = Field(..., description="Whether the job was triggered successfully") - message: str = Field(..., description="Response message") - job_id: str = Field(..., description="ID of the scheduled job") - execution_id: Optional[str] = Field(None, 
description="ID of the triggered execution") - - -class ScheduledJobListResponse(BaseModel): - """Response model for scheduled job list""" - jobs: list = Field(..., description="List of scheduled jobs") - total_count: int = Field(..., description="Total number of jobs") - page: int = Field(..., description="Current page number") - page_size: int = Field(..., description="Number of jobs per page") - - -class JobExecutionResponse(BaseModel): - """Response model for a single job execution""" - execution_id: str = Field(..., description="ID of the job execution") - job_id: str = Field(..., description="ID of the scheduled job") - status: str = Field(..., description="Execution status") - started_at: Optional[str] = Field(None, description="Execution start timestamp") - completed_at: Optional[str] = Field(None, description="Execution completion timestamp") - result: Optional[Dict[str, Any]] = Field(None, description="Execution result data") - error_message: Optional[str] = Field(None, description="Error message if execution failed") - - -class ScheduledJobResponse(BaseModel): - """Response model for a single scheduled job""" - job_id: str = Field(..., description="ID of the scheduled job") - job_name: str = Field(..., description="Name of the scheduled job") - service_type: str = Field(..., description="Type of service") - cron_expression: str = Field(..., description="Cron expression for scheduling") - job_config: Dict[str, Any] = Field(..., description="Configuration for the job") - is_active: bool = Field(..., description="Whether the job is active") - created_at: Optional[str] = Field(None, description="Job creation timestamp") - updated_at: Optional[str] = Field(None, description="Job last update timestamp") \ No newline at end of file diff --git a/scrapegraph-py/scrapegraph_py/models/schema.py b/scrapegraph-py/scrapegraph_py/models/schema.py deleted file mode 100644 index d747f4b..0000000 --- a/scrapegraph-py/scrapegraph_py/models/schema.py +++ /dev/null @@ -1,117 
+0,0 @@ -""" -Pydantic models for the Schema Generation API endpoint. - -This module defines request and response models for the Schema Generation endpoint, -which uses AI to generate or refine JSON schemas based on user prompts. - -The Schema Generation endpoint can: -- Generate new schemas from natural language descriptions -- Refine and extend existing schemas -- Create structured data models for web scraping -""" - -from typing import Any, Dict, Optional -from uuid import UUID - -from pydantic import BaseModel, Field, model_validator - - -class GenerateSchemaRequest(BaseModel): - """Request model for generate_schema endpoint""" - - user_prompt: str = Field( - ..., - example="Find laptops with specifications like brand, processor, RAM, storage, and price", - description="The user's search query to be refined into a schema" - ) - existing_schema: Optional[Dict[str, Any]] = Field( - default=None, - example={ - "$defs": { - "ProductSchema": { - "title": "ProductSchema", - "type": "object", - "properties": { - "name": {"title": "Name", "type": "string"}, - "price": {"title": "Price", "type": "number"}, - }, - "required": ["name", "price"], - } - } - }, - description="Optional existing JSON schema to modify/extend" - ) - - @model_validator(mode="after") - def validate_user_prompt(self) -> "GenerateSchemaRequest": - if not self.user_prompt or not self.user_prompt.strip(): - raise ValueError("user_prompt cannot be empty") - self.user_prompt = self.user_prompt.strip() - return self - - def model_dump(self, *args, **kwargs) -> dict: - kwargs.setdefault("exclude_none", True) - return super().model_dump(*args, **kwargs) - - -class GetSchemaStatusRequest(BaseModel): - """Request model for get_schema_status endpoint""" - - request_id: str = Field( - ..., - example="123e4567-e89b-12d3-a456-426614174000", - description="The request ID returned from generate_schema" - ) - - @model_validator(mode="after") - def validate_request_id(self) -> "GetSchemaStatusRequest": - 
self.request_id = self.request_id.strip() - try: - # Validate the request_id is a valid UUID - UUID(self.request_id) - except ValueError: - raise ValueError("request_id must be a valid UUID") - return self - - -class SchemaGenerationResponse(BaseModel): - """Response model for schema generation endpoints""" - - request_id: str = Field( - ..., - description="Unique identifier for the schema generation request" - ) - status: str = Field( - ..., - example="completed", - description="Status of the schema generation (pending, processing, completed, failed)" - ) - user_prompt: str = Field( - ..., - description="The original user prompt that was processed" - ) - refined_prompt: Optional[str] = Field( - default=None, - description="AI-refined version of the user prompt" - ) - generated_schema: Optional[Dict[str, Any]] = Field( - default=None, - description="The generated JSON schema" - ) - error: Optional[str] = Field( - default=None, - description="Error message if the request failed" - ) - created_at: Optional[str] = Field( - default=None, - description="Timestamp when the request was created" - ) - updated_at: Optional[str] = Field( - default=None, - description="Timestamp when the request was last updated" - ) - - def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization - kwargs.setdefault("exclude_none", True) - return super().model_dump(*args, **kwargs) diff --git a/scrapegraph-py/scrapegraph_py/models/scrape.py b/scrapegraph-py/scrapegraph_py/models/scrape.py index a080957..3155612 100644 --- a/scrapegraph-py/scrapegraph_py/models/scrape.py +++ b/scrapegraph-py/scrapegraph_py/models/scrape.py @@ -1,91 +1,96 @@ """ -Pydantic models for the Scrape API endpoint. +Pydantic models for the v2 Scrape endpoint. -This module defines request and response models for the basic Scrape endpoint, -which retrieves raw HTML content from websites. 
- -The Scrape endpoint is useful for: -- Getting clean HTML content from websites -- Handling JavaScript-heavy sites -- Preprocessing before AI extraction +POST /api/v1/scrape - Fetch a page in a given format (markdown, html, screenshot, branding). """ -from typing import Optional -from uuid import UUID +from enum import Enum +from typing import Any, Dict, Literal, Optional from pydantic import BaseModel, Field, model_validator +from .shared import FetchConfig + + +class ScrapeFormat(str, Enum): + """Output format for the scrape endpoint.""" + + MARKDOWN = "markdown" + HTML = "html" + SCREENSHOT = "screenshot" + BRANDING = "branding" + + +class MarkdownConfig(BaseModel): + """Configuration for markdown output.""" + + mode: str = Field(default="normal", description="Markdown mode (normal, etc.)") + + +class HtmlConfig(BaseModel): + """Configuration for html output.""" + + mode: str = Field(default="normal", description="HTML mode") + + +class ScreenshotConfig(BaseModel): + """Configuration for screenshot output.""" + + full_page: bool = Field(default=False, description="Capture full page") + class ScrapeRequest(BaseModel): + """Request model for POST /api/v1/scrape. + + The API expects a format-specific config key in the body, e.g.: + {"url": "...", "markdown": {"mode": "normal"}} + {"url": "...", "html": {"mode": "normal"}} + {"url": "...", "screenshot": {"full_page": false}} """ - Request model for the Scrape endpoint. - - This model validates and structures requests for basic HTML scraping - without AI extraction. - - Attributes: - website_url: URL of the website to scrape - render_heavy_js: Whether to render heavy JavaScript (default: False) - branding: Whether to include branding in the response (default: False) - headers: Optional HTTP headers including cookies - mock: Whether to use mock mode for testing - - Example: - >>> request = ScrapeRequest( - ... website_url="https://example.com", - ... render_heavy_js=True, - ... branding=True - ... 
) - """ - website_url: str = Field(..., example="https://scrapegraphai.com/") - render_heavy_js: bool = Field( - False, - description="Whether to render heavy JavaScript (defaults to False)", - ) - branding: bool = Field( - False, - description="Whether to include branding in the response (defaults to False)", + + url: str = Field(..., description="URL of the page to scrape") + format: ScrapeFormat = Field( + default=ScrapeFormat.MARKDOWN, + description="Output format: markdown, html, screenshot, or branding", + exclude=True, ) - headers: Optional[dict[str, str]] = Field( - None, - example={ - "User-Agent": "scrapegraph-py", - "Cookie": "cookie1=value1; cookie2=value2", - }, - description="Optional headers to send with the request, including cookies " - "and user agent", + markdown: Optional[MarkdownConfig] = Field(default=None) + html: Optional[HtmlConfig] = Field(default=None) + screenshot: Optional[ScreenshotConfig] = Field(default=None) + branding: Optional[Dict[str, Any]] = Field(default=None) + fetch_config: Optional[FetchConfig] = Field( + default=None, description="Fetch configuration options" ) - mock: bool = Field(default=False, description="Whether to use mock mode for the request") - stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") - wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website") @model_validator(mode="after") def validate_url(self) -> "ScrapeRequest": - if self.website_url is None or not self.website_url.strip(): - raise ValueError("Website URL cannot be empty") - if not ( - self.website_url.startswith("http://") - or self.website_url.startswith("https://") - ): - raise ValueError("Invalid URL") + if not self.url or not self.url.strip(): + raise ValueError("URL cannot be empty") + if not (self.url.startswith("http://") or self.url.startswith("https://")): + raise ValueError("URL must start with http:// or https://") return self - 
def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization + @model_validator(mode="after") + def set_format_config(self) -> "ScrapeRequest": + """Auto-populate the format config key if none were explicitly set.""" + has_any = any([self.markdown, self.html, self.screenshot, self.branding]) + if not has_any: + if self.format == ScrapeFormat.MARKDOWN: + self.markdown = MarkdownConfig() + elif self.format == ScrapeFormat.HTML: + self.html = HtmlConfig() + elif self.format == ScrapeFormat.SCREENSHOT: + self.screenshot = ScreenshotConfig() + elif self.format == ScrapeFormat.BRANDING: + self.branding = {} + return self + + def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: kwargs.setdefault("exclude_none", True) return super().model_dump(*args, **kwargs) class GetScrapeRequest(BaseModel): - """Request model for get_scrape endpoint""" + """Request model for GET /api/v1/scrape/:id.""" - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_request_id(self) -> "GetScrapeRequest": - try: - # Validate the request_id is a valid UUID - UUID(self.request_id) - except ValueError: - raise ValueError("request_id must be a valid UUID") - return self + request_id: str = Field(..., description="The request ID to fetch") diff --git a/scrapegraph-py/scrapegraph_py/models/search.py b/scrapegraph-py/scrapegraph_py/models/search.py new file mode 100644 index 0000000..a465561 --- /dev/null +++ b/scrapegraph-py/scrapegraph_py/models/search.py @@ -0,0 +1,37 @@ +""" +Pydantic models for the v2 Search endpoint. + +POST /v2/search - Web search with AI extraction (replaces SearchScraper). 
+""" + +from typing import Any, Dict, Optional + +from pydantic import BaseModel, Field, conint, model_validator + +from .shared import LlmConfig + + +class SearchRequest(BaseModel): + """Request model for POST /v2/search.""" + + query: str = Field(..., description="The search query") + num_results: conint(ge=3, le=20) = Field( + default=5, description="Number of results to return (3-20)" + ) + output_schema: Optional[Dict[str, Any]] = Field( + default=None, + description="JSON Schema defining the structure of the extracted data", + ) + llm_config: Optional[LlmConfig] = Field( + default=None, description="LLM configuration options" + ) + + @model_validator(mode="after") + def validate_query(self) -> "SearchRequest": + if not self.query or not self.query.strip(): + raise ValueError("Query cannot be empty") + return self + + def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: + kwargs.setdefault("exclude_none", True) + return super().model_dump(*args, **kwargs) diff --git a/scrapegraph-py/scrapegraph_py/models/searchscraper.py b/scrapegraph-py/scrapegraph_py/models/searchscraper.py deleted file mode 100644 index d143f97..0000000 --- a/scrapegraph-py/scrapegraph_py/models/searchscraper.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -Pydantic models for the SearchScraper API endpoint. - -This module defines request and response models for the SearchScraper endpoint, -which performs AI-powered web research by searching, scraping, and synthesizing -information from multiple sources. - -The SearchScraper: -- Searches the web for relevant pages based on a query -- Scrapes multiple websites (3-20 configurable) -- Extracts and synthesizes information using AI -- Supports both AI extraction and markdown conversion modes -""" - -from enum import Enum -from typing import Optional, Type -from uuid import UUID - -from pydantic import BaseModel, Field, model_validator - - -class TimeRange(str, Enum): - """Time range filter for search results. 
- - Controls how recent the search results should be. This is useful for - finding recent news, updates, or time-sensitive information. - - Values: - PAST_HOUR: Results from the past hour - PAST_24_HOURS: Results from the past 24 hours - PAST_WEEK: Results from the past week - PAST_MONTH: Results from the past month - PAST_YEAR: Results from the past year - """ - - PAST_HOUR = "past_hour" - PAST_24_HOURS = "past_24_hours" - PAST_WEEK = "past_week" - PAST_MONTH = "past_month" - PAST_YEAR = "past_year" - - -class SearchScraperRequest(BaseModel): - """ - Request model for the SearchScraper endpoint. - - This model validates and structures requests for web research and scraping - across multiple search results. - - Attributes: - user_prompt: The search query/prompt - num_results: Number of websites to scrape (3-20, default 3) - headers: Optional HTTP headers - output_schema: Optional Pydantic model for structured extraction - extraction_mode: Use AI extraction (True) or markdown (False) - mock: Whether to use mock mode for testing - render_heavy_js: Whether to render heavy JavaScript - location_geo_code: Optional geo code for location-based search (e.g., "us") - time_range: Optional time range filter for search results - - Example: - >>> request = SearchScraperRequest( - ... user_prompt="What is the latest version of Python?", - ... num_results=5, - ... extraction_mode=True - ... ) - """ - user_prompt: str = Field(..., example="What is the latest version of Python?") - num_results: Optional[int] = Field( - default=3, - ge=3, - le=20, - example=5, - description="Number of websites to scrape (3-20). Default is 3. 
More " - "websites provide better research depth but cost more credits.", - ) - headers: Optional[dict[str, str]] = Field( - None, - example={ - "User-Agent": "scrapegraph-py", - "Cookie": "cookie1=value1; cookie2=value2", - }, - description="Optional headers to send with the request, including cookies " - "and user agent", - ) - output_schema: Optional[Type[BaseModel]] = None - extraction_mode: bool = Field( - default=True, - description="Whether to use AI extraction (True) or markdown conversion (False). " - "AI extraction costs 10 credits per page, markdown conversion costs 2 credits per page.", - ) - mock: bool = Field(default=False, description="Whether to use mock mode for the request") - render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") - stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") - location_geo_code: Optional[str] = Field( - None, - description="The geo code of the location to search in", - example="us", - ) - time_range: Optional[TimeRange] = Field( - None, - description="The date range to filter search results", - examples=[ - TimeRange.PAST_HOUR, - TimeRange.PAST_24_HOURS, - TimeRange.PAST_WEEK, - TimeRange.PAST_MONTH, - TimeRange.PAST_YEAR, - ], - ) - - @model_validator(mode="after") - def validate_user_prompt(self) -> "SearchScraperRequest": - if self.user_prompt is None or not self.user_prompt.strip(): - raise ValueError("User prompt cannot be empty") - if not any(c.isalnum() for c in self.user_prompt): - raise ValueError("User prompt must contain a valid prompt") - return self - - def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization - kwargs.setdefault("exclude_none", True) - data = super().model_dump(*args, **kwargs) - # Convert the Pydantic model schema to dict if present - if self.output_schema is not None: - data["output_schema"] = self.output_schema.model_json_schema() - return 
data - - -class GetSearchScraperRequest(BaseModel): - """Request model for get_searchscraper endpoint""" - - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_request_id(self) -> "GetSearchScraperRequest": - try: - # Validate the request_id is a valid UUID - UUID(self.request_id) - except ValueError: - raise ValueError("request_id must be a valid UUID") - return self diff --git a/scrapegraph-py/scrapegraph_py/models/shared.py b/scrapegraph-py/scrapegraph_py/models/shared.py new file mode 100644 index 0000000..dd9624e --- /dev/null +++ b/scrapegraph-py/scrapegraph_py/models/shared.py @@ -0,0 +1,66 @@ +""" +Shared configuration models for the ScrapeGraphAI v2 API. + +These models are used across multiple endpoints for fetch and LLM configuration. +""" + +from typing import Any, Dict, Optional + +from pydantic import BaseModel, Field + + +class FetchConfig(BaseModel): + """Configuration for how pages are fetched.""" + + mock: bool = Field(default=False, description="Use mock mode for testing") + stealth: bool = Field( + default=False, description="Enable stealth mode to avoid bot detection" + ) + scrolls: Optional[int] = Field( + default=None, ge=0, le=100, description="Number of scrolls to perform (0-100)" + ) + country: Optional[str] = Field( + default=None, description="Country code for geo-located requests (e.g. 
'us')" + ) + cookies: Optional[Dict[str, str]] = Field( + default=None, description="Cookies to send with the request" + ) + headers: Optional[Dict[str, str]] = Field( + default=None, description="Custom HTTP headers to send with the request" + ) + wait_ms: Optional[int] = Field( + default=None, + ge=0, + description="Milliseconds to wait before scraping for JS rendering", + ) + render_js: bool = Field( + default=False, description="Whether to render heavy JavaScript" + ) + + def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: + kwargs.setdefault("exclude_none", True) + return super().model_dump(*args, **kwargs) + + +class LlmConfig(BaseModel): + """Configuration for the LLM used in extraction.""" + + model: Optional[str] = Field( + default=None, description="LLM model to use for extraction" + ) + temperature: Optional[float] = Field( + default=None, + ge=0.0, + le=2.0, + description="Sampling temperature (0.0-2.0)", + ) + max_tokens: Optional[int] = Field( + default=None, ge=1, description="Maximum tokens in the response" + ) + chunker: Optional[str] = Field( + default=None, description="Chunking strategy for large pages" + ) + + def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: + kwargs.setdefault("exclude_none", True) + return super().model_dump(*args, **kwargs) diff --git a/scrapegraph-py/scrapegraph_py/models/sitemap.py b/scrapegraph-py/scrapegraph_py/models/sitemap.py deleted file mode 100644 index 4095cbb..0000000 --- a/scrapegraph-py/scrapegraph_py/models/sitemap.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Models for sitemap endpoint""" - -from typing import Optional - -from pydantic import BaseModel, Field, model_validator - - -class SitemapRequest(BaseModel): - """Request model for sitemap endpoint. - - Extracts all URLs from a website's sitemap. Automatically discovers sitemap - from robots.txt or common sitemap locations like /sitemap.xml and sitemap - index files. 
- - The sitemap endpoint is useful for: - - Discovering all pages on a website - - Building comprehensive crawling lists - - SEO audits and analysis - - Content inventory management - - Attributes: - website_url (str): The base URL of the website to extract sitemap from. - Must start with http:// or https://. The API will automatically - discover the sitemap location. - mock (bool): Whether to use mock mode for the request. When True, returns - stubbed responses without making actual API calls. Defaults to False. - - Raises: - ValueError: If website_url is empty, None, or doesn't start with - http:// or https://. - - Examples: - Basic usage:: - - >>> request = SitemapRequest(website_url="https://example.com") - >>> print(request.website_url) - https://example.com - - With mock mode:: - - >>> request = SitemapRequest( - ... website_url="https://example.com", - ... mock=True - ... ) - >>> print(request.mock) - True - - The API automatically discovers sitemaps from: - - robots.txt directives (Sitemap: https://example.com/sitemap.xml) - - Common locations (/sitemap.xml, /sitemap_index.xml) - - Sitemap index files with nested sitemaps - - Note: - The website_url should be the base domain URL. The API will handle - sitemap discovery automatically. - """ - - website_url: str = Field( - ..., - example="https://scrapegraphai.com/", - description="The URL of the website to extract sitemap from" - ) - mock: bool = Field( - default=False, - description="Whether to use mock mode for the request" - ) - - @model_validator(mode="after") - def validate_url(self) -> "SitemapRequest": - """Validate the website URL. - - Ensures the URL is not empty and uses http:// or https:// protocol. - - Returns: - SitemapRequest: The validated instance. - - Raises: - ValueError: If URL is empty or uses invalid protocol. 
- """ - if self.website_url is None or not self.website_url.strip(): - raise ValueError("Website URL cannot be empty") - if not ( - self.website_url.startswith("http://") - or self.website_url.startswith("https://") - ): - raise ValueError("URL must start with http:// or https://") - return self - - def model_dump(self, *args, **kwargs) -> dict: - """Serialize the model to a dictionary. - - Automatically excludes None values from the serialized output to - produce cleaner JSON payloads for the API. - - Args: - *args: Positional arguments passed to parent model_dump. - **kwargs: Keyword arguments passed to parent model_dump. - If 'exclude_none' is not specified, it defaults to True. - - Returns: - dict: Dictionary representation of the model with None values excluded. - - Examples: - >>> request = SitemapRequest(website_url="https://example.com") - >>> data = request.model_dump() - >>> print(data) - {'website_url': 'https://example.com', 'mock': False} - """ - kwargs.setdefault("exclude_none", True) - return super().model_dump(*args, **kwargs) - - -class SitemapResponse(BaseModel): - """Response model for sitemap endpoint. - - Contains the complete list of URLs extracted from the website's sitemap. - The URLs are returned in the order they appear in the sitemap, which - typically reflects the website's intended structure and priority. - - This response is useful for: - - Building comprehensive URL lists for crawling - - Identifying content structure and organization - - Discovering all public pages on a website - - Planning content migration or archival - - Attributes: - urls (list[str]): Complete list of URLs extracted from the sitemap. - Each URL is a fully-qualified absolute URL string. The list may - be empty if no sitemap is found or if the sitemap contains no URLs. - URLs are deduplicated and ordered as they appear in the sitemap. - - Examples: - Basic usage:: - - >>> response = SitemapResponse(urls=[ - ... "https://example.com/", - ... 
"https://example.com/about" - ... ]) - >>> print(f"Found {len(response.urls)} URLs") - Found 2 URLs - - Iterating over URLs:: - - >>> response = SitemapResponse(urls=[ - ... "https://example.com/", - ... "https://example.com/products", - ... "https://example.com/contact" - ... ]) - >>> for url in response.urls: - ... print(url) - https://example.com/ - https://example.com/products - https://example.com/contact - - Filtering URLs:: - - >>> response = SitemapResponse(urls=[ - ... "https://example.com/", - ... "https://example.com/blog/post-1", - ... "https://example.com/blog/post-2", - ... "https://example.com/products" - ... ]) - >>> blog_urls = [url for url in response.urls if '/blog/' in url] - >>> print(f"Found {len(blog_urls)} blog posts") - Found 2 blog posts - - Empty sitemap:: - - >>> response = SitemapResponse(urls=[]) - >>> if not response.urls: - ... print("No URLs found in sitemap") - No URLs found in sitemap - - Note: - The urls list may contain various types of pages including: - - Homepage and main sections - - Blog posts and articles - - Product pages - - Category and tag pages - - Media files (images, PDFs) if included in sitemap - """ - - urls: list[str] = Field( - ..., - description="List of URLs extracted from the sitemap", - example=[ - "https://example.com/", - "https://example.com/about", - "https://example.com/products", - "https://example.com/contact" - ] - ) diff --git a/scrapegraph-py/scrapegraph_py/models/smartscraper.py b/scrapegraph-py/scrapegraph_py/models/smartscraper.py deleted file mode 100644 index e68b2d8..0000000 --- a/scrapegraph-py/scrapegraph_py/models/smartscraper.py +++ /dev/null @@ -1,186 +0,0 @@ -""" -Pydantic models for the SmartScraper API endpoint. - -This module defines request and response models for the SmartScraper endpoint, -which performs AI-powered web scraping with optional pagination and scrolling support. 
- -The SmartScraper can: -- Extract structured data from websites based on user prompts -- Handle infinite scroll scenarios -- Support pagination across multiple pages -- Accept custom output schemas for structured extraction -- Process URLs, raw HTML content, or Markdown content -""" - -from typing import Dict, Optional, Type -from uuid import UUID - -try: - from bs4 import BeautifulSoup - HAS_BS4 = True -except ImportError: - HAS_BS4 = False - -from pydantic import BaseModel, Field, conint, model_validator - - -class SmartScraperRequest(BaseModel): - """ - Request model for the SmartScraper endpoint. - - This model validates and structures requests for AI-powered web scraping. - You must provide exactly one of: website_url, website_html, or website_markdown. - - Attributes: - user_prompt: Natural language prompt describing what to extract - website_url: URL of the website to scrape (optional) - website_html: Raw HTML content to scrape (optional, max 2MB) - website_markdown: Markdown content to process (optional, max 2MB) - headers: Optional HTTP headers including cookies - cookies: Optional cookies for authentication/session management - output_schema: Optional Pydantic model defining the output structure - number_of_scrolls: Number of times to scroll (0-100) for infinite scroll pages - total_pages: Number of pages to scrape (1-10) for pagination - mock: Whether to use mock mode for testing - plain_text: Whether to return plain text instead of structured data - render_heavy_js: Whether to render heavy JavaScript content - - Example: - >>> request = SmartScraperRequest( - ... website_url="https://example.com", - ... user_prompt="Extract all product names and prices" - ... ) - """ - user_prompt: str = Field( - ..., - example="Extract info about the company", - ) - website_url: Optional[str] = Field( - default=None, example="https://scrapegraphai.com/" - ) - website_html: Optional[str] = Field( - default=None, - example="

Title

Content

", - description="HTML content, maximum size 2MB", - ) - website_markdown: Optional[str] = Field( - default=None, - example="# Title\n\nContent goes here", - description="Markdown content, maximum size 2MB", - ) - headers: Optional[dict[str, str]] = Field( - None, - example={ - "User-Agent": "scrapegraph-py", - "Cookie": "cookie1=value1; cookie2=value2", - }, - description="Optional headers to send with the request, including cookies " - "and user agent", - ) - cookies: Optional[Dict[str, str]] = Field( - None, - example={"session_id": "abc123", "user_token": "xyz789"}, - description="Dictionary of cookies to send with the request for " - "authentication or session management", - ) - output_schema: Optional[Type[BaseModel]] = None - number_of_scrolls: Optional[conint(ge=0, le=100)] = Field( - default=None, - description="Number of times to scroll the page (0-100). If None, no " - "scrolling will be performed.", - example=10, - ) - total_pages: Optional[conint(ge=1, le=10)] = Field( - default=None, - description="Number of pages to scrape (1-10). 
If None, only the first " - "page will be scraped.", - example=5, - ) - mock: bool = Field(default=False, description="Whether to use mock mode for the request") - plain_text: bool = Field(default=False, description="Whether to return the result as plain text") - render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") - stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") - wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website") - - @model_validator(mode="after") - def validate_user_prompt(self) -> "SmartScraperRequest": - if self.user_prompt is None or not self.user_prompt.strip(): - raise ValueError("User prompt cannot be empty") - if not any(c.isalnum() for c in self.user_prompt): - raise ValueError("User prompt must contain a valid prompt") - return self - - @model_validator(mode="after") - def validate_url_and_html(self) -> "SmartScraperRequest": - # Count how many input sources are provided - inputs_provided = sum([ - self.website_url is not None, - self.website_html is not None, - self.website_markdown is not None - ]) - - if inputs_provided == 0: - raise ValueError("Exactly one of website_url, website_html, or website_markdown must be provided") - elif inputs_provided > 1: - raise ValueError("Only one of website_url, website_html, or website_markdown can be provided") - - # Validate HTML content - if self.website_html is not None: - if len(self.website_html.encode("utf-8")) > 2 * 1024 * 1024: - raise ValueError("Website HTML content exceeds maximum size of 2MB") - if not HAS_BS4: - raise ImportError( - "beautifulsoup4 is required for HTML validation. 
" - "Install it with: pip install scrapegraph-py[html] or pip install beautifulsoup4" - ) - try: - soup = BeautifulSoup(self.website_html, "html.parser") - if not soup.find(): - raise ValueError("Invalid HTML - no parseable content found") - except Exception as e: - if isinstance(e, ImportError): - raise - raise ValueError(f"Invalid HTML structure: {str(e)}") - - # Validate URL - elif self.website_url is not None: - if not self.website_url.strip(): - raise ValueError("Website URL cannot be empty") - if not ( - self.website_url.startswith("http://") - or self.website_url.startswith("https://") - ): - raise ValueError("Invalid URL") - - # Validate Markdown content - elif self.website_markdown is not None: - if not self.website_markdown.strip(): - raise ValueError("Website markdown cannot be empty") - if len(self.website_markdown.encode("utf-8")) > 2 * 1024 * 1024: - raise ValueError("Website markdown content exceeds maximum size of 2MB") - - return self - - def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization - kwargs.setdefault("exclude_none", True) - data = super().model_dump(*args, **kwargs) - # Convert the Pydantic model schema to dict if present - if self.output_schema is not None: - data["output_schema"] = self.output_schema.model_json_schema() - return data - - -class GetSmartScraperRequest(BaseModel): - """Request model for get_smartscraper endpoint""" - - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_request_id(self) -> "GetSmartScraperRequest": - try: - # Validate the request_id is a valid UUID - UUID(self.request_id) - except ValueError: - raise ValueError("request_id must be a valid UUID") - return self diff --git a/scrapegraph-py/scrapegraph_py/utils/helpers.py b/scrapegraph-py/scrapegraph_py/utils/helpers.py index 8e0e061..04a657d 100644 --- a/scrapegraph-py/scrapegraph_py/utils/helpers.py +++ 
b/scrapegraph-py/scrapegraph_py/utils/helpers.py @@ -1,12 +1,8 @@ """ -Helper utility functions for the ScrapeGraphAI SDK. - -This module provides utility functions for API key validation and -HTTP response handling for both synchronous and asynchronous requests. +Helper utility functions for the ScrapeGraphAI SDK v2. """ from typing import Any, Dict -from uuid import UUID import aiohttp from requests import Response @@ -15,10 +11,7 @@ def validate_api_key(api_key: str) -> bool: - """ - Validate the format of a ScrapeGraphAI API key. - - API keys must follow the format: 'sgai-' followed by a valid UUID. + """Validate that an API key is present and non-empty. Args: api_key: The API key string to validate @@ -27,32 +20,18 @@ def validate_api_key(api_key: str) -> bool: True if the API key is valid Raises: - ValueError: If the API key format is invalid - - Example: - >>> validate_api_key("sgai-12345678-1234-1234-1234-123456789abc") - True - >>> validate_api_key("invalid-key") - ValueError: Invalid API key format... + ValueError: If the API key is empty or missing """ - if not api_key.startswith("sgai-"): - raise ValueError("Invalid API key format. API key must start with 'sgai-'") - uuid_part = api_key[5:] # Strip out 'sgai-' - try: - UUID(uuid_part) - except ValueError: + if not api_key or not api_key.strip(): raise ValueError( - "Invalid API key format. API key must be 'sgai-' followed by a valid UUID. " - "You can get one at https://dashboard.scrapegraphai.com/" + "API key cannot be empty. " + "Get one at https://dashboard.scrapegraphai.com/" ) return True def handle_sync_response(response: Response) -> Dict[str, Any]: - """ - Handle and parse synchronous HTTP responses. - - Parses the JSON response and raises APIError for error status codes. + """Handle and parse synchronous HTTP responses. 
Args: response: The requests Response object @@ -62,15 +41,10 @@ def handle_sync_response(response: Response) -> Dict[str, Any]: Raises: APIError: If the response status code indicates an error (>= 400) - - Example: - >>> response = requests.get("https://api.example.com/data") - >>> data = handle_sync_response(response) """ try: data = response.json() except ValueError: - # If response is not JSON, use the raw text data = {"error": response.text} if response.status_code >= 400: @@ -83,10 +57,7 @@ def handle_sync_response(response: Response) -> Dict[str, Any]: async def handle_async_response(response: aiohttp.ClientResponse) -> Dict[str, Any]: - """ - Handle and parse asynchronous HTTP responses. - - Parses the JSON response and raises APIError for error status codes. + """Handle and parse asynchronous HTTP responses. Args: response: The aiohttp ClientResponse object @@ -96,16 +67,11 @@ async def handle_async_response(response: aiohttp.ClientResponse) -> Dict[str, A Raises: APIError: If the response status code indicates an error (>= 400) - - Example: - >>> async with session.get("https://api.example.com/data") as response: - ... 
data = await handle_async_response(response) """ try: data = await response.json() text = None except ValueError: - # If response is not JSON, use the raw text text = await response.text() data = {"error": text} diff --git a/scrapegraph-py/tests/test_async_client.py b/scrapegraph-py/tests/test_async_client.py index 592cfc5..49ba234 100644 --- a/scrapegraph-py/tests/test_async_client.py +++ b/scrapegraph-py/tests/test_async_client.py @@ -1,838 +1,327 @@ -import asyncio +"""Tests for the asynchronous AsyncClient (v2 API).""" + from uuid import uuid4 import pytest +import pytest_asyncio from aioresponses import aioresponses -from pydantic import BaseModel +from pydantic import BaseModel, Field from scrapegraph_py.async_client import AsyncClient -from scrapegraph_py.exceptions import APIError +from scrapegraph_py.config import API_BASE_URL from tests.utils import generate_mock_api_key @pytest.fixture -def mock_api_key(): +def api_key(): return generate_mock_api_key() -@pytest.fixture -def mock_uuid(): - return str(uuid4()) - - -@pytest.mark.asyncio -async def test_smartscraper_with_url(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." 
- ) - assert response["status"] == "completed" - assert "description" in response["result"] - +@pytest_asyncio.fixture +async def client(api_key): + c = AsyncClient(api_key=api_key) + yield c + await c.close() -@pytest.mark.asyncio -async def test_smartscraper_with_html(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Test content."}, - }, - ) - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_html="

Test content

", - user_prompt="Extract info", - ) - assert response["status"] == "completed" - assert "description" in response["result"] +# ------------------------------------------------------------------ +# Auth & headers +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_smartscraper_with_headers(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) +async def test_bearer_auth_header(api_key): + c = AsyncClient(api_key=api_key) + assert c.headers["Authorization"] == f"Bearer {api_key}" + assert c.headers["X-SDK-Version"].startswith("python@") + await c.close() - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com", - user_prompt="Describe this page.", - headers=headers, - ) - assert response["status"] == "completed" - assert "description" in response["result"] +# ------------------------------------------------------------------ +# Scrape +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_get_credits(mock_api_key): +async def test_scrape(client): with aioresponses() as mocked: - mocked.get( - "https://api.scrapegraphai.com/v1/credits", - payload={"remaining_credits": 100, "total_credits_used": 50}, + mocked.post( + f"{API_BASE_URL}/scrape", + payload={"request_id": str(uuid4()), "content": "# Hello"}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_credits() - assert response["remaining_credits"] == 100 - assert response["total_credits_used"] == 50 + result = await client.scrape("https://example.com") + assert "content" in result @pytest.mark.asyncio -async def 
test_submit_feedback(mock_api_key): +async def test_scrape_html_format(client): with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/feedback", payload={"status": "success"} + f"{API_BASE_URL}/scrape", + payload={"request_id": str(uuid4()), "content": "

Hello

"}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.submit_feedback( - request_id=str(uuid4()), rating=5, feedback_text="Great service!" - ) - assert response["status"] == "success" + result = await client.scrape("https://example.com", format="html") + assert "content" in result -@pytest.mark.asyncio -async def test_get_smartscraper(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/smartscraper/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": {"data": "test"}, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_smartscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid +# ------------------------------------------------------------------ +# Extract +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_smartscraper_with_pagination(mock_api_key): +async def test_extract(client): with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - ] - }, - }, + f"{API_BASE_URL}/extract", + payload={"request_id": str(uuid4()), "result": {"title": "Example"}}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=3, - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 3 - - -@pytest.mark.asyncio -async def test_smartscraper_with_pagination_and_scrolls(mock_api_key): - with aioresponses() as mocked: - 
mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - {"name": "Product 4", "price": "$40"}, - {"name": "Product 5", "price": "$50"}, - ] - }, - }, + result = await client.extract( + url="https://example.com", + prompt="Extract the title", ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information from paginated results", - total_pages=5, - number_of_scrolls=10, - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 5 + assert result["result"]["title"] == "Example" @pytest.mark.asyncio -async def test_smartscraper_with_pagination_and_all_features(mock_api_key): +async def test_extract_with_pydantic_schema(client): + class Product(BaseModel): + name: str = Field(description="Product name") + price: float = Field(description="Product price") + with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", + f"{API_BASE_URL}/extract", payload={ "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10", "rating": 4.5}, - {"name": "Product 2", "price": "$20", "rating": 4.0}, - ] - }, + "result": {"name": "Widget", "price": 9.99}, }, ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - class ProductSchema(BaseModel): - name: str - price: str - rating: float - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information with ratings", - headers=headers, - output_schema=ProductSchema, - 
number_of_scrolls=5, - total_pages=2, - ) - assert response["status"] == "completed" - assert "products" in response["result"] - - -@pytest.mark.asyncio -async def test_api_error(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - status=400, - payload={"error": "Bad request"}, - exception=APIError("Bad request", status_code=400), + result = await client.extract( + url="https://example.com", + prompt="Extract product info", + output_schema=Product, ) + assert result["result"]["name"] == "Widget" - async with AsyncClient(api_key=mock_api_key) as client: - with pytest.raises(APIError) as exc_info: - await client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." - ) - assert exc_info.value.status_code == 400 - assert "Bad request" in str(exc_info.value) - - -@pytest.mark.asyncio -async def test_markdownify(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/markdownify", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.markdownify(website_url="https://example.com") - assert response["status"] == "completed" - assert "# Example Page" in response["result"] +# ------------------------------------------------------------------ +# Search +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_markdownify_with_headers(mock_api_key): +async def test_search(client): with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/markdownify", + f"{API_BASE_URL}/search", payload={ "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", + "results": [{"url": "https://example.com"}], }, ) + result = await client.search("best web scrapers 2025") + 
assert "results" in result - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.markdownify( - website_url="https://example.com", headers=headers - ) - assert response["status"] == "completed" - assert "# Example Page" in response["result"] +# ------------------------------------------------------------------ +# Credits +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_get_markdownify(mock_api_key, mock_uuid): +async def test_credits(client): with aioresponses() as mocked: mocked.get( - f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_markdownify(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@pytest.mark.asyncio -async def test_searchscraper(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/searchscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.searchscraper( - user_prompt="What is the latest version of Python?" 
- ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@pytest.mark.asyncio -async def test_searchscraper_with_headers(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/searchscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, + f"{API_BASE_URL}/credits", + payload={"remaining_credits": 1000, "total_credits_used": 50}, ) + result = await client.credits() + assert result["remaining_credits"] == 1000 - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.searchscraper( - user_prompt="What is the latest version of Python?", - headers=headers, - ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) +# ------------------------------------------------------------------ +# History +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_get_searchscraper(mock_api_key, mock_uuid): +async def test_history(client): with aioresponses() as mocked: mocked.get( - f"https://api.scrapegraphai.com/v1/searchscraper/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, + f"{API_BASE_URL}/history", + payload={"requests": [], "total": 0}, ) + result = await client.history() + assert "requests" in result - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_searchscraper(mock_uuid) - assert response["status"] 
== "completed" - assert response["request_id"] == mock_uuid - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) - - -@pytest.mark.asyncio -async def test_crawl(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/crawl", - payload={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, - ) - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - "age": {"type": "integer"}, - }, - "required": ["name"], - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.crawl( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - cache_website=True, - depth=2, - max_pages=5, - same_domain_only=True, - batch_size=1, - ) - assert response["status"] == "processing" - assert "id" in response +# ------------------------------------------------------------------ +# Crawl namespace +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_crawl_with_minimal_params(mock_api_key): +async def test_crawl_start(client): + crawl_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/crawl", - payload={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, + f"{API_BASE_URL}/crawl", + payload={"id": crawl_id, "status": "running"}, ) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - }, - "required": ["name"], - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.crawl( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - ) - assert 
response["status"] == "processing" - assert "id" in response + result = await client.crawl.start("https://example.com", depth=3) + assert result["id"] == crawl_id @pytest.mark.asyncio -async def test_get_crawl(mock_api_key, mock_uuid): +async def test_crawl_status(client): + crawl_id = str(uuid4()) with aioresponses() as mocked: mocked.get( - f"https://api.scrapegraphai.com/v1/crawl/{mock_uuid}", - payload={ - "id": mock_uuid, - "status": "completed", - "result": { - "llm_result": { - "company": { - "name": "Example Corp", - "description": "A technology company", - }, - "services": [ - { - "service_name": "Web Development", - "description": "Custom web solutions", - } - ], - "legal": { - "privacy_policy": "Privacy policy content", - "terms_of_service": "Terms of service content", - }, - } - }, - }, + f"{API_BASE_URL}/crawl/{crawl_id}", + payload={"id": crawl_id, "status": "completed", "pages": []}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_crawl(mock_uuid) - assert response["status"] == "completed" - assert response["id"] == mock_uuid - assert "result" in response - assert "llm_result" in response["result"] + result = await client.crawl.status(crawl_id) + assert result["status"] == "completed" @pytest.mark.asyncio -async def test_crawl_markdown_mode(mock_api_key): - """Test async crawl in markdown conversion mode (no AI processing)""" +async def test_crawl_stop(client): + crawl_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/crawl", - payload={ - "id": str(uuid4()), - "status": "processing", - "message": "Markdown crawl job started", - }, + f"{API_BASE_URL}/crawl/{crawl_id}/stop", + payload={"id": crawl_id, "status": "stopped"}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.crawl( - url="https://example.com", - extraction_mode=False, # Markdown conversion mode - depth=2, - max_pages=3, - same_domain_only=True, - sitemap=True, 
- ) - assert response["status"] == "processing" - assert "id" in response + result = await client.crawl.stop(crawl_id) + assert result["status"] == "stopped" @pytest.mark.asyncio -async def test_crawl_markdown_mode_validation(mock_api_key): - """Test that async markdown mode rejects prompt and data_schema parameters""" - async with AsyncClient(api_key=mock_api_key) as client: - # Should raise validation error when prompt is provided in markdown mode - try: - await client.crawl( - url="https://example.com", - extraction_mode=False, - prompt="This should not be allowed", - ) - assert False, "Should have raised validation error" - except Exception as e: - assert "Prompt should not be provided when extraction_mode=False" in str(e) - - # Should raise validation error when data_schema is provided in markdown mode - try: - await client.crawl( - url="https://example.com", - extraction_mode=False, - data_schema={"type": "object"}, - ) - assert False, "Should have raised validation error" - except Exception as e: - assert ( - "Data schema should not be provided when extraction_mode=False" - in str(e) - ) - - -# ============================================================================ -# ASYNC SCRAPE TESTS -# ============================================================================ - - -@pytest.mark.asyncio -async def test_async_scrape_basic(mock_api_key): - """Test basic async scrape request""" +async def test_crawl_resume(client): + crawl_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": "

Example Page

This is HTML content.

", - }, + f"{API_BASE_URL}/crawl/{crawl_id}/resume", + payload={"id": crawl_id, "status": "running"}, ) + result = await client.crawl.resume(crawl_id) + assert result["status"] == "running" - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape(website_url="https://example.com") - assert response["status"] == "completed" - assert "html" in response - assert "

Example Page

" in response["html"] - - -@pytest.mark.asyncio -async def test_async_scrape_with_heavy_js(mock_api_key): - """Test async scrape request with heavy JavaScript rendering""" - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": "
JavaScript rendered content
", - }, - ) - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape( - website_url="https://example.com", - render_heavy_js=True - ) - assert response["status"] == "completed" - assert "html" in response - assert "JavaScript rendered content" in response["html"] +# ------------------------------------------------------------------ +# Monitor namespace +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_async_scrape_with_headers(mock_api_key): - """Test async scrape request with custom headers""" +async def test_monitor_create(client): + monitor_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": "

Content with custom headers

", - }, + f"{API_BASE_URL}/monitor", + payload={"id": monitor_id, "name": "Price Monitor"}, ) - - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", - "Cookie": "session=123" - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape( - website_url="https://example.com", - headers=headers - ) - assert response["status"] == "completed" - assert "html" in response + result = await client.monitor.create( + name="Price Monitor", + url="https://example.com/products", + prompt="Extract product prices", + cron="0 9 * * 1", + ) + assert result["name"] == "Price Monitor" @pytest.mark.asyncio -async def test_async_scrape_with_all_options(mock_api_key): - """Test async scrape request with all options enabled""" +async def test_monitor_list(client): with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": "
Full featured content
", - }, + mocked.get( + f"{API_BASE_URL}/monitor", + payload={"monitors": [], "total": 0}, ) - - headers = { - "User-Agent": "Custom Agent", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape( - website_url="https://example.com", - render_heavy_js=True, - headers=headers - ) - assert response["status"] == "completed" - assert "html" in response + result = await client.monitor.list() + assert "monitors" in result @pytest.mark.asyncio -async def test_async_get_scrape(mock_api_key, mock_uuid): - """Test async get scrape result""" +async def test_monitor_get(client): + monitor_id = str(uuid4()) with aioresponses() as mocked: mocked.get( - f"https://api.scrapegraphai.com/v1/scrape/{mock_uuid}", - payload={ - "scrape_request_id": mock_uuid, - "status": "completed", - "html": "

Retrieved HTML content

", - }, + f"{API_BASE_URL}/monitor/{monitor_id}", + payload={"id": monitor_id, "name": "Test Monitor"}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_scrape(mock_uuid) - assert response["status"] == "completed" - assert response["scrape_request_id"] == mock_uuid - assert "html" in response + result = await client.monitor.get(monitor_id) + assert result["id"] == monitor_id @pytest.mark.asyncio -async def test_async_scrape_error_response(mock_api_key): - """Test async scrape error response handling""" +async def test_monitor_pause(client): + monitor_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "error": "Website not accessible", - "status": "error" - }, - status=400 + f"{API_BASE_URL}/monitor/{monitor_id}/pause", + payload={"id": monitor_id, "status": "paused"}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - with pytest.raises(Exception): - await client.scrape(website_url="https://inaccessible-site.com") + result = await client.monitor.pause(monitor_id) + assert result["status"] == "paused" @pytest.mark.asyncio -async def test_async_scrape_processing_status(mock_api_key): - """Test async scrape processing status response""" +async def test_monitor_resume(client): + monitor_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "processing", - "message": "Scrape job started" - }, + f"{API_BASE_URL}/monitor/{monitor_id}/resume", + payload={"id": monitor_id, "status": "active"}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape(website_url="https://example.com") - assert response["status"] == "processing" - assert "scrape_request_id" in response + result = await client.monitor.resume(monitor_id) + assert result["status"] == "active" @pytest.mark.asyncio -async def 
test_async_scrape_complex_html_response(mock_api_key): - """Test async scrape with complex HTML response""" - complex_html = """ - - - - - - Complex Page - - - -
- -
-
-

Welcome

-

This is a complex HTML page with multiple elements.

-
- Sample image - - -
Data 1Data 2
-
-
- - - - """ - +async def test_monitor_delete(client): + monitor_id = str(uuid4()) with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": complex_html, - }, + mocked.delete( + f"{API_BASE_URL}/monitor/{monitor_id}", + payload={"message": "deleted"}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape(website_url="https://complex-example.com") - assert response["status"] == "completed" - assert "html" in response - assert "" in response["html"] - assert "Complex Page" in response["html"] - assert " - - - """ - +def test_monitor_delete(client): + monitor_id = str(uuid4()) responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/scrape", - json={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": complex_html, - }, + responses.DELETE, + f"{API_BASE_URL}/monitor/{monitor_id}", + json={"message": "deleted"}, ) + result = client.monitor.delete(monitor_id) + assert result["message"] == "deleted" - with Client(api_key=mock_api_key) as client: - response = client.scrape(website_url="https://complex-example.com") - assert response["status"] == "completed" - assert "html" in response - assert "" in response["html"] - assert "Complex Page" in response["html"] - assert "