diff --git a/docs.json b/docs.json index 282bcf6..695eb2d 100644 --- a/docs.json +++ b/docs.json @@ -3,250 +3,322 @@ "theme": "mint", "name": "ScrapeGraphAI", "colors": { - "primary": "#9333ea", - "light": "#9f52eb", - "dark": "#1f2937" + "primary": "#AC6DFF", + "light": "#AC6DFF", + "dark": "#AC6DFF" }, "favicon": "/favicon.svg", "navigation": { - "tabs": [ + "versions": [ { - "tab": "Home", - "groups": [ + "version": "v2", + "default": true, + "tabs": [ { - "group": "Get Started", - "pages": [ - "introduction", - "install", + "tab": "Home", + "groups": [ { - "group": "Use Cases", + "group": "Get Started", "pages": [ - "use-cases/overview", - "use-cases/ai-llm", - "use-cases/lead-generation", - "use-cases/market-intelligence", - "use-cases/content-aggregation", - "use-cases/research-analysis", - "use-cases/seo-analytics" + "introduction", + "install", + { + "group": "Use Cases", + "pages": [ + "use-cases/overview", + "use-cases/ai-llm", + "use-cases/lead-generation", + "use-cases/market-intelligence", + "use-cases/content-aggregation", + "use-cases/research-analysis", + "use-cases/seo-analytics" + ] + }, + { + "group": "Dashboard", + "pages": [ + "dashboard/overview", + "dashboard/settings" + ] + } ] }, { - "group": "Dashboard", + "group": "Services", "pages": [ - "dashboard/overview", - "dashboard/playground", - "dashboard/settings" + "services/scrape", + "services/extract", + "services/search", + "services/crawl", + "services/monitor", + { + "group": "CLI", + "icon": "terminal", + "pages": [ + "services/cli/introduction", + "services/cli/commands", + "services/cli/json-mode", + "services/cli/ai-agent-skill", + "services/cli/examples" + ] + }, + { + "group": "MCP Server", + "icon": "/logo/mcp.svg", + "pages": [ + "services/mcp-server/introduction", + "services/mcp-server/cursor", + "services/mcp-server/claude", + "services/mcp-server/smithery" + ] + }, + "services/toonify", + { + "group": "Additional Parameters", + "pages": [ + 
"services/additional-parameters/headers", + "services/additional-parameters/pagination", + "services/additional-parameters/proxy", + "services/additional-parameters/wait-ms" + ] + } + ] + }, + { + "group": "Official SDKs", + "pages": [ + "sdks/python", + "sdks/javascript", + "sdks/mocking" + ] + }, + { + "group": "Integrations", + "pages": [ + "integrations/langchain", + "integrations/llamaindex", + "integrations/crewai", + "integrations/agno", + "integrations/vercel_ai", + "integrations/google-adk" + ] + }, + { + "group": "LLM SDKs & Frameworks", + "pages": [ + "developer-guides/llm-sdks-and-frameworks/gemini", + "developer-guides/llm-sdks-and-frameworks/anthropic" + ] + }, + { + "group": "Contribute", + "pages": [ + "contribute/opensource" ] } ] }, { - "group": "Services", - "pages": [ - "services/smartscraper", - "services/searchscraper", - "services/markdownify", - "services/scrape", - "services/smartcrawler", - "services/sitemap", - "services/agenticscraper", + "tab": "Knowledge Base", + "groups": [ + { + "group": "Knowledge Base", + "pages": [ + "knowledge-base/introduction" + ] + }, + { + "group": "Scraping Tools", + "pages": [ + "knowledge-base/ai-tools/lovable", + "knowledge-base/ai-tools/v0", + "knowledge-base/ai-tools/bolt", + "knowledge-base/ai-tools/cursor" + ] + }, { "group": "CLI", - "icon": "terminal", "pages": [ - "services/cli/introduction", - "services/cli/commands", - "services/cli/json-mode", - "services/cli/ai-agent-skill", - "services/cli/examples" + "knowledge-base/cli/getting-started", + "knowledge-base/cli/json-mode", + "knowledge-base/cli/ai-agent-skill", + "knowledge-base/cli/command-examples" + ] + }, + { + "group": "Troubleshooting", + "pages": [ + "knowledge-base/troubleshooting/cors-error", + "knowledge-base/troubleshooting/empty-results", + "knowledge-base/troubleshooting/rate-limiting", + "knowledge-base/troubleshooting/timeout-errors" ] }, { - "group": "MCP Server", - "icon": "/logo/mcp.svg", + "group": "Scraping Guides", "pages": 
[ - "services/mcp-server/introduction", - "services/mcp-server/cursor", - "services/mcp-server/claude", - "services/mcp-server/smithery" + "knowledge-base/scraping/javascript-rendering", + "knowledge-base/scraping/pagination", + "knowledge-base/scraping/custom-headers", + "knowledge-base/scraping/proxy" ] }, - "services/toonify", { - "group": "Additional Parameters", + "group": "Account & Credits", "pages": [ - "services/additional-parameters/headers", - "services/additional-parameters/pagination", - "services/additional-parameters/proxy", - "services/additional-parameters/wait-ms" + "knowledge-base/account/api-keys", + "knowledge-base/account/credits", + "knowledge-base/account/rate-limits" ] } ] }, { - "group": "Official SDKs", - "pages": [ - "sdks/python", - "sdks/javascript", - "sdks/mocking" - ] - }, - { - "group": "Integrations", - "pages": [ - "integrations/langchain", - "integrations/llamaindex", - "integrations/crewai", - "integrations/agno", - "integrations/langflow", - "integrations/vercel_ai", - "integrations/google-adk", - "integrations/x402" - ] - }, - { - "group": "LLM SDKs & Frameworks", - "pages": [ - "developer-guides/llm-sdks-and-frameworks/gemini", - "developer-guides/llm-sdks-and-frameworks/anthropic" - ] - }, - { - "group": "Contribute", - "pages": [ - "contribute/opensource" - ] - } - ] - }, - { - "tab": "Knowledge Base", - "groups": [ - { - "group": "Knowledge Base", - "pages": [ - "knowledge-base/introduction" - ] - }, - { - "group": "Scraping Tools", - "pages": [ - "knowledge-base/ai-tools/lovable", - "knowledge-base/ai-tools/v0", - "knowledge-base/ai-tools/bolt", - "knowledge-base/ai-tools/cursor" - ] - }, - { - "group": "CLI", - "pages": [ - "knowledge-base/cli/getting-started", - "knowledge-base/cli/json-mode", - "knowledge-base/cli/ai-agent-skill", - "knowledge-base/cli/command-examples" - ] - }, - { - "group": "Troubleshooting", - "pages": [ - "knowledge-base/troubleshooting/cors-error", - 
"knowledge-base/troubleshooting/empty-results", - "knowledge-base/troubleshooting/rate-limiting", - "knowledge-base/troubleshooting/timeout-errors" - ] - }, - { - "group": "Scraping Guides", - "pages": [ - "knowledge-base/scraping/javascript-rendering", - "knowledge-base/scraping/pagination", - "knowledge-base/scraping/custom-headers", - "knowledge-base/scraping/proxy" - ] - }, - { - "group": "Account & Credits", - "pages": [ - "knowledge-base/account/api-keys", - "knowledge-base/account/credits", - "knowledge-base/account/rate-limits" - ] - } - ] - }, - { - "tab": "Cookbook", - "groups": [ - { - "group": "Cookbook", - "pages": [ - "cookbook/introduction" + "tab": "Cookbook", + "groups": [ + { + "group": "Cookbook", + "pages": [ + "cookbook/introduction" + ] + }, + { + "group": "Examples", + "pages": [ + "cookbook/examples/company-info", + "cookbook/examples/github-trending", + "cookbook/examples/wired", + "cookbook/examples/homes", + "cookbook/examples/research-agent", + "cookbook/examples/chat-webpage", + "cookbook/examples/pagination" + ] + } ] }, { - "group": "Examples", - "pages": [ - "cookbook/examples/company-info", - "cookbook/examples/github-trending", - "cookbook/examples/wired", - "cookbook/examples/homes", - "cookbook/examples/research-agent", - "cookbook/examples/chat-webpage", - "cookbook/examples/pagination" + "tab": "API Reference", + "groups": [ + { + "group": "API Documentation", + "pages": [ + "api-reference/introduction", + "api-reference/errors" + ] + }, + { + "group": "SmartScraper", + "pages": [ + "api-reference/endpoint/smartscraper/start", + "api-reference/endpoint/smartscraper/get-status" + ] + }, + { + "group": "SearchScraper", + "pages": [ + "api-reference/endpoint/searchscraper/start", + "api-reference/endpoint/searchscraper/get-status" + ] + }, + { + "group": "Markdownify", + "pages": [ + "api-reference/endpoint/markdownify/start", + "api-reference/endpoint/markdownify/get-status" + ] + }, + { + "group": "SmartCrawler", + "pages": [ + 
"api-reference/endpoint/smartcrawler/start", + "api-reference/endpoint/smartcrawler/get-status" + ] + }, + { + "group": "Sitemap", + "pages": [ + "api-reference/endpoint/sitemap/start", + "api-reference/endpoint/sitemap/get-status" + ] + }, + { + "group": "User", + "pages": [ + "api-reference/endpoint/user/get-credits", + "api-reference/endpoint/user/submit-feedback" + ] + } ] } ] }, { - "tab": "API Reference", - "groups": [ + "version": "v1", + "tabs": [ { - "group": "API Documentation", - "pages": [ - "api-reference/introduction", - "api-reference/errors" - ] - }, - { - "group": "SmartScraper", - "pages": [ - "api-reference/endpoint/smartscraper/start", - "api-reference/endpoint/smartscraper/get-status" - ] - }, - { - "group": "SearchScraper", - "pages": [ - "api-reference/endpoint/searchscraper/start", - "api-reference/endpoint/searchscraper/get-status" - ] - }, - { - "group": "Markdownify", - "pages": [ - "api-reference/endpoint/markdownify/start", - "api-reference/endpoint/markdownify/get-status" - ] - }, - { - "group": "SmartCrawler", - "pages": [ - "api-reference/endpoint/smartcrawler/start", - "api-reference/endpoint/smartcrawler/get-status" - ] - }, - { - "group": "Sitemap", - "pages": [ - "api-reference/endpoint/sitemap/start", - "api-reference/endpoint/sitemap/get-status" + "tab": "Home", + "groups": [ + { + "group": "Get Started", + "pages": [ + "v1/introduction", + "v1/quickstart" + ] + }, + { + "group": "Services", + "pages": [ + "v1/smartscraper", + "v1/searchscraper", + "v1/markdownify", + "v1/scrape", + "v1/smartcrawler", + "v1/sitemap", + "v1/agenticscraper", + { + "group": "CLI", + "icon": "terminal", + "pages": [ + "v1/cli/introduction", + "v1/cli/commands", + "v1/cli/json-mode", + "v1/cli/ai-agent-skill", + "v1/cli/examples" + ] + }, + { + "group": "MCP Server", + "icon": "/logo/mcp.svg", + "pages": [ + "v1/mcp-server/introduction", + "v1/mcp-server/cursor", + "v1/mcp-server/claude", + "v1/mcp-server/smithery" + ] + }, + "v1/toonify", + { + 
"group": "Additional Parameters", + "pages": [ + "v1/additional-parameters/headers", + "v1/additional-parameters/pagination", + "v1/additional-parameters/proxy", + "v1/additional-parameters/wait-ms" + ] + } + ] + } ] }, { - "group": "User", - "pages": [ - "api-reference/endpoint/user/get-credits", - "api-reference/endpoint/user/submit-feedback" + "tab": "API Reference", + "groups": [ + { + "group": "API Documentation", + "pages": [ + "v1/api-reference/introduction" + ] + } ] } ] @@ -259,12 +331,7 @@ "href": "https://scrapegraphai.com/", "icon": "globe" }, - { - "anchor": "Community", - "href": "https://discord.gg/uJN7TYcpNa", - "icon": "discord" - }, - { +{ "anchor": "Blog", "href": "https://scrapegraphai.com/blog", "icon": "newspaper" @@ -279,7 +346,18 @@ }, "background": { "color": { - "dark": "#101725" + "dark": "#242424", + "light": "#EFEFEF" + } + }, + "fonts": { + "heading": { + "family": "IBM Plex Sans", + "weight": 500 + }, + "body": { + "family": "IBM Plex Sans", + "weight": 400 } }, "navbar": { @@ -293,7 +371,7 @@ "href": "mailto:contact@scrapegraphai.com" }, { - "label": "⭐ 23.2k+", + "label": "⭐ 23k+", "href": "https://github.com/ScrapeGraphAI/Scrapegraph-ai" } ], @@ -322,4 +400,4 @@ "vscode" ] } -} \ No newline at end of file +} diff --git a/install.md b/install.md index 1f1165d..f39b9d0 100644 --- a/install.md +++ b/install.md @@ -1,6 +1,6 @@ --- title: Installation -description: 'Install and get started with ScrapeGraphAI SDKs' +description: 'Install and get started with ScrapeGraphAI v2 SDKs' --- ## Prerequisites @@ -22,10 +22,10 @@ from scrapegraph_py import Client client = Client(api_key="your-api-key-here") -# Scrape a website -response = client.smartscraper( - website_url="https://scrapegraphai.com", - user_prompt="Extract information about the company" +# Extract data from a website +response = client.extract( + url="https://scrapegraphai.com", + prompt="Extract information about the company" ) print(response) ``` @@ -40,6 +40,8 @@ For more 
advanced usage, see the [Python SDK documentation](/sdks/python). ## JavaScript SDK +Requires **Node.js >= 22**. + Install using npm, pnpm, yarn, or bun: ```bash @@ -59,20 +61,16 @@ bun add scrapegraph-js **Usage:** ```javascript -import { smartScraper } from "scrapegraph-js"; +import scrapegraphai from "scrapegraph-js"; -const apiKey = "your-api-key-here"; +const sgai = scrapegraphai({ apiKey: "your-api-key-here" }); -const response = await smartScraper(apiKey, { - website_url: "https://scrapegraphai.com", - user_prompt: "What does the company do?", -}); +const { data } = await sgai.extract( + "https://scrapegraphai.com", + { prompt: "What does the company do?" } +); -if (response.status === "error") { - console.error("Error:", response.error); -} else { - console.log(response.data.result); -} +console.log(data); ``` @@ -85,18 +83,18 @@ For more advanced usage, see the [JavaScript SDK documentation](/sdks/javascript ## Key Concepts -### SmartScraper +### Extract (formerly SmartScraper) Extract specific information from any webpage using AI. Provide a URL and a prompt describing what you want to extract. [Learn more](/services/smartscraper) -### SearchScraper -Search and extract information from multiple web sources using AI. Start with just a prompt - SearchScraper will find relevant websites and extract the information you need. [Learn more](/services/searchscraper) +### Search (formerly SearchScraper) +Search and extract information from multiple web sources using AI. Start with just a query - Search will find relevant websites and extract the information you need. [Learn more](/services/search) + +### Scrape +Convert any webpage into markdown, HTML, screenshot, or branding format. Replaces the previous Markdownify endpoint with additional output formats. [Learn more](/services/scrape) ### SmartCrawler AI-powered extraction for any webpage with crawl capabilities. Automatically navigate and extract data from multiple pages. 
[Learn more](/services/smartcrawler) -### Markdownify -Convert any webpage into clean, formatted markdown. Perfect for content aggregation and processing. [Learn more](/services/markdownify) - ### Structured Output with Schemas Both SDKs support structured output using schemas: - **Python**: Use Pydantic models @@ -119,34 +117,37 @@ class CompanyInfo(BaseModel): industry: str = Field(description="Industry sector") client = Client(api_key="your-api-key") -result = client.smartscraper( - website_url="https://scrapegraphai.com", - user_prompt="Extract company information", +response = client.extract( + url="https://scrapegraphai.com", + prompt="Extract company information", output_schema=CompanyInfo ) -print(result) +print(response) ``` ### JavaScript Example ```javascript -import { smartScraper } from "scrapegraph-js"; +import scrapegraphai from "scrapegraph-js"; import { z } from "zod"; +const sgai = scrapegraphai({ apiKey: "your-api-key" }); + const CompanySchema = z.object({ - company_name: z.string().describe("The company name"), + companyName: z.string().describe("The company name"), description: z.string().describe("Company description"), website: z.string().url().describe("Company website URL"), industry: z.string().describe("Industry sector"), }); -const apiKey = "your-api-key"; -const response = await smartScraper(apiKey, { - website_url: "https://scrapegraphai.com", - user_prompt: "Extract company information", - output_schema: CompanySchema, -}); -console.log(response.data.result); +const { data } = await sgai.extract( + "https://scrapegraphai.com", + { + prompt: "Extract company information", + schema: CompanySchema, + } +); +console.log(data); ``` --- diff --git a/integrations/langchain.mdx b/integrations/langchain.mdx index aed504f..09d5def 100644 --- a/integrations/langchain.mdx +++ b/integrations/langchain.mdx @@ -25,20 +25,20 @@ pip install langchain-scrapegraph ## Available Tools -### SmartScraperTool +### ExtractTool Extract structured data from 
any webpage using natural language prompts: ```python -from langchain_scrapegraph.tools import SmartScraperTool +from langchain_scrapegraph.tools import ExtractTool # Initialize the tool (uses SGAI_API_KEY from environment) -tool = SmartscraperTool() +tool = ExtractTool() # Extract information using natural language result = tool.invoke({ - "website_url": "https://www.example.com", - "user_prompt": "Extract the main heading and first paragraph" + "url": "https://www.example.com", + "prompt": "Extract the main heading and first paragraph" }) ``` @@ -46,60 +46,51 @@ result = tool.invoke({ Define the structure of the output using Pydantic models: ```python -from typing import List from pydantic import BaseModel, Field -from langchain_scrapegraph.tools import SmartScraperTool +from langchain_scrapegraph.tools import ExtractTool class WebsiteInfo(BaseModel): - title: str = Field(description="The main title of the webpage") - description: str = Field(description="The main description or first paragraph") - urls: List[str] = Field(description="The URLs inside the webpage") + title: str = Field(description="The main title of the page") + description: str = Field(description="The main description") -# Initialize with schema -tool = SmartScraperTool(llm_output_schema=WebsiteInfo) +# Initialize with output schema +tool = ExtractTool(llm_output_schema=WebsiteInfo) result = tool.invoke({ - "website_url": "https://www.example.com", - "user_prompt": "Extract the website information" + "url": "https://example.com", + "prompt": "Extract the title and description" }) ``` -### SearchScraperTool +### SearchTool -Process HTML content directly with AI extraction: +Search the web and extract structured results using AI: ```python -from langchain_scrapegraph.tools import SearchScraperTool +from langchain_scrapegraph.tools import SearchTool - -tool = SearchScraperTool() +tool = SearchTool() result = tool.invoke({ - "user_prompt": "Find the best restaurants in San Francisco", + "query": 
"Find the best restaurants in San Francisco", }) - ``` - -```python -from typing import Optional -from pydantic import BaseModel, Field -from langchain_scrapegraph.tools import SearchScraperTool +### ScrapeTool -class RestaurantInfo(BaseModel): - name: str = Field(description="The restaurant name") - address: str = Field(description="The restaurant address") - rating: float = Field(description="The restaurant rating") +Scrape a webpage and return it in the desired format: +```python +from langchain_scrapegraph.tools import ScrapeTool -tool = SearchScraperTool(llm_output_schema=RestaurantInfo) +tool = ScrapeTool() -result = tool.invoke({ - "user_prompt": "Find the best restaurants in San Francisco" -}) +# Scrape as markdown (default) +result = tool.invoke({"url": "https://example.com"}) +# Scrape as HTML +result = tool.invoke({"url": "https://example.com", "format": "html"}) ``` - ### MarkdownifyTool @@ -112,34 +103,146 @@ tool = MarkdownifyTool() markdown = tool.invoke({"website_url": "https://example.com"}) ``` +### Crawl Tools + +Start and manage crawl jobs with `CrawlStartTool`, `CrawlStatusTool`, `CrawlStopTool`, and `CrawlResumeTool`: + +```python +import time +from langchain_scrapegraph.tools import CrawlStartTool, CrawlStatusTool + +start_tool = CrawlStartTool() +status_tool = CrawlStatusTool() + +# Start a crawl job +result = start_tool.invoke({ + "url": "https://example.com", + "depth": 2, + "max_pages": 5, + "format": "markdown", +}) +print("Crawl started:", result) + +# Check status +crawl_id = result.get("id") +if crawl_id: + time.sleep(5) + status = status_tool.invoke({"crawl_id": crawl_id}) + print("Crawl status:", status) +``` + +### Monitor Tools + +Create and manage monitors (replaces scheduled jobs) with `MonitorCreateTool`, `MonitorListTool`, `MonitorGetTool`, `MonitorPauseTool`, `MonitorResumeTool`, and `MonitorDeleteTool`: + +```python +from langchain_scrapegraph.tools import MonitorCreateTool, MonitorListTool + +create_tool = 
MonitorCreateTool() +list_tool = MonitorListTool() + +# Create a monitor +result = create_tool.invoke({ + "name": "Price Monitor", + "url": "https://example.com/products", + "prompt": "Extract current product prices", + "cron": "0 9 * * *", # Daily at 9 AM +}) +print("Monitor created:", result) + +# List all monitors +monitors = list_tool.invoke({}) +print("All monitors:", monitors) +``` + +### HistoryTool + +Retrieve request history: + +```python +from langchain_scrapegraph.tools import HistoryTool + +tool = HistoryTool() +history = tool.invoke({}) +``` + +### GetCreditsTool + +Check your remaining API credits: + +```python +from langchain_scrapegraph.tools import GetCreditsTool + +tool = GetCreditsTool() +credits = tool.invoke({}) +``` + ## Example Agent Create a research agent that can gather and analyze web data: ```python -from langchain.agents import initialize_agent, AgentType -from langchain_scrapegraph.tools import SmartScraperTool +from langchain.agents import AgentExecutor, create_openai_functions_agent +from langchain_core.messages import SystemMessage +from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder from langchain_openai import ChatOpenAI +from langchain_scrapegraph.tools import ExtractTool, GetCreditsTool, SearchTool -# Initialize tools +# Initialize the tools tools = [ - SmartScraperTool(), + ExtractTool(), + GetCreditsTool(), + SearchTool(), ] -# Create an agent -agent = initialize_agent( - tools=tools, - llm=ChatOpenAI(temperature=0), - agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, - verbose=True -) - -# Use the agent -response = agent.run(""" - Visit example.com, make a summary of the content and extract the main heading and first paragraph -""") +# Create the prompt template +prompt = ChatPromptTemplate.from_messages([ + SystemMessage( + content=( + "You are a helpful AI assistant that can analyze websites and extract information. " + "You have access to tools that can help you scrape and process web content. 
" + "Always explain what you're doing before using a tool." + ) + ), + MessagesPlaceholder(variable_name="chat_history", optional=True), + ("user", "{input}"), + MessagesPlaceholder(variable_name="agent_scratchpad"), +]) + +# Initialize the LLM +llm = ChatOpenAI(temperature=0) + +# Create the agent +agent = create_openai_functions_agent(llm, tools, prompt) +agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True) + +# Example usage +response = agent_executor.invoke({ + "input": "Extract the main products from https://www.scrapegraphai.com/" +}) +print(response["output"]) ``` +## Migration from v1 + +If you're upgrading from v1, here are the key changes: + +| v1 Tool | v2 Tool | +|---------|---------| +| `SmartScraperTool` | `ExtractTool` | +| `SearchScraperTool` | `SearchTool` | +| `SmartCrawlerTool` | `CrawlStartTool` / `CrawlStatusTool` / `CrawlStopTool` / `CrawlResumeTool` | +| `CreateScheduledJobTool` | `MonitorCreateTool` | +| `GetScheduledJobsTool` | `MonitorListTool` | +| `GetScheduledJobTool` | `MonitorGetTool` | +| `PauseScheduledJobTool` | `MonitorPauseTool` | +| `ResumeScheduledJobTool` | `MonitorResumeTool` | +| `DeleteScheduledJobTool` | `MonitorDeleteTool` | +| `MarkdownifyTool` | `MarkdownifyTool` (unchanged) | +| `GetCreditsTool` | `GetCreditsTool` (unchanged) | +| `AgenticScraperTool` | Removed | +| -- | `HistoryTool` (new) | + ## Configuration Set your ScrapeGraph API key in your environment: diff --git a/integrations/vercel_ai.mdx b/integrations/vercel_ai.mdx index 889df6b..f57a290 100644 --- a/integrations/vercel_ai.mdx +++ b/integrations/vercel_ai.mdx @@ -5,19 +5,19 @@ description: "Integrate ScrapeGraphAI into Vercel AI" ## Overview -[Vercel AI sdk](https://ai-sdk.dev/) is a very populate javascript/typescript framework to interact with various LLMs providers. 
This page shows how to integrate it with ScrapeGraph +[Vercel AI SDK](https://ai-sdk.dev/) is a popular JavaScript/TypeScript framework to interact with various LLM providers. This page shows how to integrate it with ScrapeGraph. - View the integration on LlamaHub + View the Vercel AI SDK documentation ## Installation -Follow out [javascript sdk installation steps](/sdks/javascript) using your favourite package manager: +Follow our [JavaScript SDK installation steps](/sdks/javascript) using your favourite package manager: ```bash # Using npm @@ -33,7 +33,7 @@ yarn add scrapegraph-js bun add scrapegraph-js ``` -Then, install [vercel ai](https://ai-sdk.dev/docs/getting-started) with their [openai provider](https://ai-sdk.dev/providers/ai-sdk-providers/openai) +Then, install [Vercel AI](https://ai-sdk.dev/docs/getting-started) with their [OpenAI provider](https://ai-sdk.dev/providers/ai-sdk-providers/openai): ```bash # Using npm @@ -51,15 +51,15 @@ bun add ai @ai-sdk/openai ## Usage -ScrapeGraph sdk can be used like any other tools, see [vercel ai tool calling doc](https://ai-sdk.dev/docs/ai-sdk-core/tools-and-tool-calling) +The ScrapeGraph SDK can be used like any other tool. See [Vercel AI tool calling docs](https://ai-sdk.dev/docs/ai-sdk-core/tools-and-tool-calling). 
```ts import { z } from "zod"; import { generateText, tool } from "ai"; import { openai } from "@ai-sdk/openai"; -import { smartScraper } from "scrapegraph-js"; +import scrapegraphai from "scrapegraph-js"; -const apiKey = process.env.SGAI_APIKEY; +const sgai = scrapegraphai({ apiKey: process.env.SGAI_API_KEY }); const ArticleSchema = z.object({ title: z.string().describe("The article title"), @@ -77,15 +77,14 @@ const result = await generateText({ model: openai("gpt-4.1-mini"), tools: { scrape: tool({ - description: "Get articles information for a given url.", + description: "Extract article information from a given URL.", parameters: z.object({ - url: z.string().describe("The exact url."), + url: z.string().describe("The exact URL."), }), execute: async ({ url }) => { - const response = await smartScraper(apiKey, { - website_url: url, - user_prompt: "Extract the article information", - output_schema: ArticlesArraySchema, + const response = await sgai.extract(url, { + prompt: "Extract the article information", + schema: ArticlesArraySchema, }); return response.data; }, @@ -97,8 +96,6 @@ const result = await generateText({ console.log(result); ``` -**TODO ADD THE LOGS** - ## Support Need help with the integration? @@ -107,7 +104,7 @@ Need help with the integration? Report bugs and request features diff --git a/sdks/javascript.mdx b/sdks/javascript.mdx index ed4fb55..ea1c726 100644 --- a/sdks/javascript.mdx +++ b/sdks/javascript.mdx @@ -1,6 +1,6 @@ --- title: "JavaScript SDK" -description: "Official JavaScript/TypeScript SDK for ScrapeGraphAI" +description: "Official JavaScript/TypeScript SDK for ScrapeGraphAI v2" icon: "js" --- @@ -22,8 +22,6 @@ icon: "js" ## Installation -Install the package using npm, pnpm, yarn or bun: - ```bash # Using npm npm i scrapegraph-js @@ -38,82 +36,77 @@ yarn add scrapegraph-js bun add scrapegraph-js ``` -## Features + +v2 requires **Node.js >= 22**. 
+ -- **AI-Powered Extraction**: Smart web scraping with artificial intelligence -- **Async by Design**: Fully asynchronous architecture -- **Type Safety**: Built-in TypeScript support with Zod schemas -- **Zero Exceptions**: All errors wrapped in `ApiResult` — no try/catch needed -- **Developer Friendly**: Comprehensive error handling and debug logging +## What's New in v2 -## Quick Start +- **Factory pattern**: Create a client with `scrapegraphai({ apiKey })` instead of importing individual functions +- **Renamed methods**: `smartScraper` → `extract`, `searchScraper` → `search` +- **camelCase parameters**: All params are now camelCase (e.g., `fetchConfig` instead of `fetch_config`) +- **Throws on error**: Methods return `{ data, requestId }` and throw on failure (no more `ApiResult` wrapper) +- **Native Zod support**: Pass Zod schemas directly to `schema` parameter +- **Namespace methods**: `crawl.start()`, `monitor.create()`, etc. +- **Removed**: `agenticScraper`, `generateSchema`, `sitemap`, `checkHealth`, `markdownify` -### Basic example + +v2 is a breaking release. If you're upgrading from v1, see the [Migration Guide](https://github.com/ScrapeGraphAI/scrapegraph-js/blob/main/MIGRATION.md). + - - Store your API keys securely in environment variables. Use `.env` files and - libraries like `dotenv` to load them into your app. - +## Quick Start ```javascript -import { smartScraper } from "scrapegraph-js"; -import "dotenv/config"; +import scrapegraphai from "scrapegraph-js"; -const apiKey = process.env.SGAI_APIKEY; +const sgai = scrapegraphai({ apiKey: "your-api-key" }); -const response = await smartScraper(apiKey, { - website_url: "https://example.com", - user_prompt: "What does the company do?", -}); +const { data, requestId } = await sgai.extract( + "https://example.com", + { prompt: "What does the company do?" 
} +); -if (response.status === "error") { - console.error("Error:", response.error); -} else { - console.log(response.data.result); -} +console.log(data); ``` + +Store your API keys securely in environment variables. Use `.env` files and +libraries like `dotenv` to load them into your app. + + +### Client Options + +| Parameter | Type | Default | Description | +| ---------- | ------ | -------------------------------- | ------------------------------- | +| apiKey | string | Required | Your ScrapeGraphAI API key | +| baseUrl | string | `https://api.scrapegraphai.com` | API base URL | +| timeout | number | `30000` | Request timeout in ms | +| maxRetries | number | `3` | Maximum number of retries | + ## Services -### SmartScraper +### extract() -Extract specific information from any webpage using AI: +Extract structured data from any webpage using AI. Replaces the v1 `smartScraper` function. ```javascript -const response = await smartScraper(apiKey, { - website_url: "https://example.com", - user_prompt: "Extract the main content", -}); -``` - -All functions return an `ApiResult` object: -```typescript -type ApiResult = { - status: "success" | "error"; - data: T | null; - error?: string; - elapsedMs: number; -}; +const { data, requestId } = await sgai.extract( + "https://example.com", + { prompt: "Extract the main heading and description" } +); ``` #### Parameters -| Parameter | Type | Required | Description | -| --------------- | ------- | -------- | ----------------------------------------------------------------------------------- | -| apiKey | string | Yes | The ScrapeGraph API Key (first argument). | -| user_prompt | string | Yes | A textual description of what you want to extract. | -| website_url | string | No* | The URL of the webpage to scrape. *One of `website_url`, `website_html`, or `website_markdown` is required. | -| output_schema | object | No | A Zod schema (converted to JSON) that describes the structure of the response. 
| -| number_of_scrolls | number | No | Number of scrolls for infinite scroll pages (0-50). | -| stealth | boolean | No | Enable anti-detection mode (+4 credits). | -| headers | object | No | Custom HTTP headers. | -| mock | boolean | No | Enable mock mode for testing. | -| wait_ms | number | No | Page load wait time in ms (default: 3000). | -| country_code | string | No | Proxy routing country code (e.g., "us"). | - - -Define a simple schema using Zod: +| Parameter | Type | Required | Description | +| -------------------- | ----------- | -------- | -------------------------------------------------------- | +| url | string | Yes | The URL of the webpage to scrape | +| options.prompt | string | Yes | A description of what you want to extract | +| options.schema | ZodSchema / object | No | Zod schema or JSON schema for structured response | +| options.fetchConfig | FetchConfig | No | Fetch configuration | +| options.llmConfig | LlmConfig | No | LLM configuration | + ```javascript import { z } from "zod"; @@ -122,301 +115,216 @@ const ArticleSchema = z.object({ author: z.string().describe("The author's name"), publishDate: z.string().describe("Article publication date"), content: z.string().describe("Main article content"), - category: z.string().describe("Article category"), }); -const ArticlesArraySchema = z - .array(ArticleSchema) - .describe("Array of articles"); +const { data } = await sgai.extract( + "https://example.com/blog/article", + { + prompt: "Extract the article information", + schema: ArticleSchema, + } +); -const response = await smartScraper(apiKey, { - website_url: "https://example.com/blog/article", - user_prompt: "Extract the article information", - output_schema: ArticlesArraySchema, -}); - -console.log(`Title: ${response.data.result.title}`); -console.log(`Author: ${response.data.result.author}`); -console.log(`Published: ${response.data.result.publishDate}`); +console.log(`Title: ${data.title}`); +console.log(`Author: ${data.author}`); ``` - - 
-Define a complex schema for nested data structures: - + ```javascript -import { z } from "zod"; - -const EmployeeSchema = z.object({ - name: z.string().describe("Employee's full name"), - position: z.string().describe("Job title"), - department: z.string().describe("Department name"), - email: z.string().describe("Email address"), -}); - -const OfficeSchema = z.object({ - location: z.string().describe("Office location/city"), - address: z.string().describe("Full address"), - phone: z.string().describe("Contact number"), -}); - -const CompanySchema = z.object({ - name: z.string().describe("Company name"), - description: z.string().describe("Company description"), - industry: z.string().describe("Industry sector"), - foundedYear: z.number().describe("Year company was founded"), - employees: z.array(EmployeeSchema).describe("List of key employees"), - offices: z.array(OfficeSchema).describe("Company office locations"), - website: z.string().url().describe("Company website URL"), -}); +const { data } = await sgai.extract( + "https://example.com", + { + prompt: "Extract the main heading", + fetchConfig: { + stealth: true, + renderJs: true, + waitMs: 2000, + scrolls: 3, + }, + llmConfig: { + temperature: 0.3, + maxTokens: 1000, + }, + } +); +``` + -const response = await smartScraper(apiKey, { - website_url: "https://example.com/about", - user_prompt: "Extract detailed company information including employees and offices", - output_schema: CompanySchema, -}); +### search() -console.log(`Company: ${response.data.result.name}`); -console.log("\nKey Employees:"); -response.data.result.employees.forEach((employee) => { - console.log(`- ${employee.name} (${employee.position})`); -}); +Search the web and extract information. Replaces the v1 `searchScraper` function. 
-console.log("\nOffice Locations:"); -response.data.result.offices.forEach((office) => { - console.log(`- ${office.location}: ${office.address}`); -}); +```javascript +const { data } = await sgai.search( + "What are the key features and pricing of ChatGPT Plus?", + { numResults: 5 } +); ``` - +#### Parameters - -For modern web applications built with React, Vue, Angular, or other JavaScript frameworks: +| Parameter | Type | Required | Description | +| -------------------- | ----------- | -------- | -------------------------------------------------------- | +| query | string | Yes | The search query | +| options.numResults | number | No | Number of results (3-20). Default: 3 | +| options.schema | ZodSchema / object | No | Schema for structured response | +| options.fetchConfig | FetchConfig | No | Fetch configuration | +| options.llmConfig | LlmConfig | No | LLM configuration | + ```javascript -import { smartScraper } from 'scrapegraph-js'; -import { z } from 'zod'; - -const apiKey = 'your-api-key'; +import { z } from "zod"; const ProductSchema = z.object({ - name: z.string().describe('Product name'), - price: z.string().describe('Product price'), - description: z.string().describe('Product description'), - availability: z.string().describe('Product availability status') + name: z.string().describe("Product name"), + price: z.string().describe("Product price"), + features: z.array(z.string()).describe("Key features"), }); -const response = await smartScraper(apiKey, { - website_url: 'https://example-react-store.com/products/123', - user_prompt: 'Extract product details including name, price, description, and availability', - output_schema: ProductSchema, -}); +const { data } = await sgai.search( + "Find information about iPhone 15 Pro", + { + schema: ProductSchema, + numResults: 5, + } +); -if (response.status === 'error') { - console.error('Error:', response.error); -} else { - console.log('Product:', response.data.result.name); - console.log('Price:', 
response.data.result.price); - console.log('Available:', response.data.result.availability); -} +console.log(`Product: ${data.name}`); +console.log(`Price: ${data.price}`); ``` - -### SearchScraper +### scrape() -Search and extract information from multiple web sources using AI: +Convert any webpage to markdown, HTML, screenshot, or branding format. ```javascript -const response = await searchScraper(apiKey, { - user_prompt: "Find the best restaurants in San Francisco", - location_geo_code: "us", - time_range: "past_week", -}); +const { data } = await sgai.scrape("https://example.com"); +console.log(data); ``` #### Parameters -| Parameter | Type | Required | Description | -| ------------------ | ------- | -------- | ---------------------------------------------------------------------------------- | -| apiKey | string | Yes | The ScrapeGraph API Key (first argument). | -| user_prompt | string | Yes | A textual description of what you want to achieve. | -| num_results | number | No | Number of websites to search (3-20). Default: 3. | -| extraction_mode | boolean | No | **true** = AI extraction mode (10 credits/page), **false** = markdown mode (2 credits/page). | -| output_schema | object | No | Zod schema for structured response format (AI extraction mode only). | -| location_geo_code | string | No | Geo code for location-based search (e.g., "us"). | -| time_range | string | No | Time range filter. Options: "past_hour", "past_24_hours", "past_week", "past_month", "past_year". 
| +| Parameter | Type | Required | Description | +| -------------------- | ----------- | -------- | -------------------------------------------------------- | +| url | string | Yes | The URL of the webpage to scrape | +| options.format | string | No | `"markdown"`, `"html"`, `"screenshot"`, `"branding"` | +| options.fetchConfig | FetchConfig | No | Fetch configuration | - -Define a simple schema using Zod: +### crawl -```javascript -import { z } from "zod"; +Manage multi-page crawl operations asynchronously. -const ArticleSchema = z.object({ - title: z.string().describe("The article title"), - author: z.string().describe("The author's name"), - publishDate: z.string().describe("Article publication date"), - content: z.string().describe("Main article content"), - category: z.string().describe("Article category"), +```javascript +// Start a crawl +const job = await sgai.crawl.start("https://example.com", { + depth: 2, + includePatterns: ["/blog/*", "/docs/**"], + excludePatterns: ["/admin/*", "/api/*"], }); +console.log(`Crawl started: ${job.data.id}`); -const response = await searchScraper(apiKey, { - user_prompt: "Find news about the latest trends in AI", - output_schema: ArticleSchema, - location_geo_code: "us", - time_range: "past_week", -}); +// Check status +const status = await sgai.crawl.status(job.data.id); +console.log(`Status: ${status.data.status}`); -console.log(`Title: ${response.data.result.title}`); -console.log(`Author: ${response.data.result.author}`); -console.log(`Published: ${response.data.result.publishDate}`); +// Stop / Resume +await sgai.crawl.stop(job.data.id); +await sgai.crawl.resume(job.data.id); ``` - +### monitor - -Define a complex schema for nested data structures: +Create and manage site monitoring jobs. 
```javascript -import { z } from "zod"; - -const EmployeeSchema = z.object({ - name: z.string().describe("Employee's full name"), - position: z.string().describe("Job title"), - department: z.string().describe("Department name"), - email: z.string().describe("Email address"), +// Create a monitor +const monitor = await sgai.monitor.create({ + url: "https://example.com", + prompt: "Track price changes", + schedule: "daily", }); -const OfficeSchema = z.object({ - location: z.string().describe("Office location/city"), - address: z.string().describe("Full address"), - phone: z.string().describe("Contact number"), -}); - -const RestaurantSchema = z.object({ - name: z.string().describe("Restaurant name"), - address: z.string().describe("Restaurant address"), - rating: z.number().describe("Restaurant rating"), - website: z.string().url().describe("Restaurant website URL"), -}); +// List all monitors +const monitors = await sgai.monitor.list(); -const response = await searchScraper(apiKey, { - user_prompt: "Find the best restaurants in San Francisco", - output_schema: RestaurantSchema, - location_geo_code: "us", - time_range: "past_month", -}); +// Get / Pause / Resume / Delete +const details = await sgai.monitor.get(monitor.data.id); +await sgai.monitor.pause(monitor.data.id); +await sgai.monitor.resume(monitor.data.id); +await sgai.monitor.delete(monitor.data.id); ``` - +### credits() - -Use markdown mode for cost-effective content gathering: +Check your account credit balance. 
```javascript -import { searchScraper } from 'scrapegraph-js'; +const { data } = await sgai.credits(); +console.log(`Remaining: ${data.remainingCredits}`); +console.log(`Used: ${data.totalCreditsUsed}`); +``` -const apiKey = 'your-api-key'; +### history() -const response = await searchScraper(apiKey, { - user_prompt: 'Latest developments in artificial intelligence', - num_results: 3, - extraction_mode: false, - location_geo_code: "us", - time_range: "past_week", +Retrieve paginated request history. + +```javascript +const { data } = await sgai.history({ + page: 1, + perPage: 20, + service: "extract", }); -if (response.status === 'error') { - console.error('Error:', response.error); -} else { - const markdownContent = response.data.markdown_content; - console.log('Markdown content length:', markdownContent.length); - console.log('Reference URLs:', response.data.reference_urls); - console.log('Content preview:', markdownContent.substring(0, 500) + '...'); -} +data.items.forEach((entry) => { + console.log(`${entry.createdAt} - ${entry.service} - ${entry.status}`); +}); ``` -**Markdown Mode Benefits:** -- **Cost-effective**: Only 2 credits per page (vs 10 credits for AI extraction) -- **Full content**: Get complete page content in markdown format -- **Faster**: No AI processing overhead -- **Perfect for**: Content analysis, bulk data collection, building datasets +## Configuration Objects - +### FetchConfig - -Filter search results by date range to get only recent information: +Controls how pages are fetched. 
```javascript -import { searchScraper } from 'scrapegraph-js'; - -const apiKey = 'your-api-key'; - -const response = await searchScraper(apiKey, { - user_prompt: 'Latest news about AI developments', - num_results: 5, - time_range: 'past_week', // Options: 'past_hour', 'past_24_hours', 'past_week', 'past_month', 'past_year' -}); - -if (response.status === 'error') { - console.error('Error:', response.error); -} else { - console.log('Recent AI news:', response.data.result); - console.log('Reference URLs:', response.data.reference_urls); +{ + stealth: true, // Anti-detection mode + renderJs: true, // Render JavaScript + waitMs: 2000, // Wait time after page load (ms) + scrolls: 3, // Number of scrolls + country: "us", // Proxy country code + cookies: { key: "value" }, + headers: { "X-Custom": "header" }, } ``` -**Time Range Options:** -- `past_hour` - Results from the past hour -- `past_24_hours` - Results from the past 24 hours -- `past_week` - Results from the past week -- `past_month` - Results from the past month -- `past_year` - Results from the past year - -**Use Cases:** -- Finding recent news and updates -- Tracking time-sensitive information -- Getting latest product releases -- Monitoring recent market changes - - - -### Markdownify +### LlmConfig -Convert any webpage into clean, formatted markdown: +Controls LLM behavior for AI-powered methods. ```javascript -const response = await markdownify(apiKey, { - website_url: "https://example.com", -}); +{ + model: "default", // LLM model to use + temperature: 0.3, // Response creativity (0-1) + maxTokens: 1000, // Maximum response tokens +} ``` -#### Parameters - -| Parameter | Type | Required | Description | -| ----------- | ------- | -------- | ---------------------------------------------- | -| apiKey | string | Yes | The ScrapeGraph API Key (first argument). | -| website_url | string | Yes | The URL of the webpage to convert to markdown. | -| wait_ms | number | No | Page load wait time in ms (default: 3000). 
| -| stealth | boolean | No | Enable anti-detection mode (+4 credits). | -| country_code| string | No | Proxy routing country code (e.g., "us"). | +## Error Handling -## API Credits - -Check your available API credits: +v2 throws errors instead of returning `ApiResult`. Use try/catch: ```javascript -import { getCredits } from "scrapegraph-js"; - -const credits = await getCredits(apiKey); - -if (credits.status === "error") { - console.error("Error fetching credits:", credits.error); -} else { - console.log("Remaining credits:", credits.data.remaining_credits); - console.log("Total used:", credits.data.total_credits_used); +try { + const { data, requestId } = await sgai.extract( + "https://example.com", + { prompt: "Extract the title" } + ); + console.log(data); +} catch (err) { + console.error(`Request failed: ${err.message}`); } ``` @@ -438,9 +346,3 @@ if (credits.status === "error") { Get help from our development team - - - This project is licensed under the MIT License. See the - [LICENSE](https://github.com/ScrapeGraphAI/scrapegraph-js/blob/main/LICENSE) - file for details. - diff --git a/sdks/mocking.mdx b/sdks/mocking.mdx index 592de50..a1a0f58 100644 --- a/sdks/mocking.mdx +++ b/sdks/mocking.mdx @@ -1,6 +1,6 @@ --- title: 'Mocking & Testing' -description: 'Test ScrapeGraphAI functionality in an isolated environment without consuming API credits' +description: 'Test ScrapeGraphAI v2 functionality without consuming API credits' icon: 'test-tube' --- @@ -11,584 +11,261 @@ icon: 'test-tube' /> - - Test your code without making real API calls + + Use familiar testing tools for mocking - - Override responses for specific endpoints + + Test without consuming API credits ## Overview -A mock environment is an isolated test environment. You can use mock mode to test ScrapeGraphAI functionality in your application, and experiment with new features without affecting your live integration or consuming API credits. 
For example, when testing in mock mode, the scraping requests you create aren't processed by our servers or counted against your credit usage. +In v2, the built-in mock mode (`mock=True`, `mock_handler`, `mock_responses`) has been removed from the SDKs. Instead, use standard mocking libraries for your language to test ScrapeGraphAI integrations without making real API calls or consuming credits. -## Use cases + +If you're migrating from v1, replace `Client(mock=True)` with standard mocking patterns shown below. + -Mock mode provides an environment for testing various functionalities and scenarios without the implications of real API calls. Below are some common use cases for mocking in your ScrapeGraphAI integrations: +## Python SDK Testing -| Scenario | Description | -|----------|-------------| -| **Simulate scraping responses to test without real API calls** | Use mock mode to test scraping functionality without real API calls. Create mock responses in your application to test data processing logic or use custom handlers to simulate various response scenarios. | -| **Scale isolated testing for teams** | Your team can test in separate mock environments to make sure that data and actions are completely isolated from other tests. Changes made in one mock configuration don't interfere with changes in another. | -| **Test without API key requirements** | You can test your integration without providing real API keys, making it easier for external developers, implementation partners, or design agencies to work with your code without access to your live API credentials. | -| **Test in development or CI/CD pipelines** | Access mock mode from your development environment or continuous integration pipelines. Test ScrapeGraphAI functionality directly in your code or use familiar testing frameworks and fixtures. | - -## Test in mock mode - -You can simulate scraping responses and use mock data to test your integration without consuming API credits. 
Learn more about using mock responses to confirm that your integration works correctly. - -## Basic Mock Usage - -Enable mock mode by setting `mock=True` when initializing the client: +### Using `unittest.mock` ```python +from unittest.mock import patch, MagicMock from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -# Set logging level for better visibility -sgai_logger.set_logging(level="INFO") - -def basic_mock_usage(): - # Initialize the client with mock mode enabled - client = Client.from_env(mock=True) - - print("\n-- get_credits (mock) --") - print(client.get_credits()) - - print("\n-- markdownify (mock) --") - md = client.markdownify(website_url="https://example.com") - print(md) - - print("\n-- get_markdownify (mock) --") - md_status = client.get_markdownify("00000000-0000-0000-0000-000000000123") - print(md_status) - - print("\n-- smartscraper (mock) --") - ss = client.smartscraper(user_prompt="Extract title", website_url="https://example.com") - print(ss) - -if __name__ == "__main__": - basic_mock_usage() -``` - - -When mock mode is enabled, all API calls return predefined mock responses instead of making real HTTP requests. This ensures your tests run quickly and don't consume API credits. 
- - -## Custom Response Overrides -You can override specific endpoint responses using the `mock_responses` parameter: +def test_extract(): + client = Client(api_key="test-key") -```python -def mock_with_path_overrides(): - # Initialize the client with mock mode and custom responses - client = Client.from_env( - mock=True, - mock_responses={ - "/v1/credits": {"remaining_credits": 42, "total_credits_used": 58, "mock": true} + mock_response = { + "data": { + "title": "Test Page", + "content": "This is test content" }, - ) - - print("\n-- get_credits with override (mock) --") - print(client.get_credits()) -``` + "request_id": "test-request-123" + } - -You can override responses for any endpoint by providing the path and expected response: + with patch.object(client, "extract", return_value=mock_response): + response = client.extract( + url="https://example.com", + prompt="Extract title and content" + ) -```python -client = Client.from_env( - mock=True, - mock_responses={ - "/v1/credits": { - "remaining_credits": 100, - "total_credits_used": 0, - "mock": true - }, - "/v1/smartscraper/start": { - "job_id": "mock-job-123", - "status": "processing", - "mock": true - }, - "/v1/smartscraper/status/mock-job-123": { - "job_id": "mock-job-123", - "status": "completed", - "result": { - "title": "Mock Title", - "content": "Mock content from the webpage", - "mock": true - } - }, - "/v1/markdownify/start": { - "job_id": "mock-markdown-456", - "status": "processing", - "mock": true - }, - "/v1/markdownify/status/mock-markdown-456": { - "job_id": "mock-markdown-456", - "status": "completed", - "result": "# Mock Markdown\n\nThis is mock markdown content.", - "mock": true - } - } -) + assert response["data"]["title"] == "Test Page" + assert response["request_id"] == "test-request-123" ``` - -## Custom Handler Functions +### Using `responses` Library -For more complex mocking scenarios, you can provide a custom handler function: +Mock HTTP requests at the transport layer: ```python -def 
mock_with_custom_handler(): - def handler(method, url, kwargs): - return {"handled_by": "custom_handler", "method": method, "url": url} - - # Initialize the client with mock mode and custom handler - client = Client.from_env(mock=True, mock_handler=handler) +import responses +from scrapegraph_py import Client - print("\n-- searchscraper via custom handler (mock) --") - resp = client.searchscraper(user_prompt="Search something") - print(resp) -``` +@responses.activate +def test_extract_http(): + responses.post( + "https://api.scrapegraphai.com/api/v2/extract", + json={ + "data": {"title": "Mock Title"}, + "request_id": "mock-123" + }, + status=200, + ) - -Create sophisticated mock responses based on request parameters: + client = Client(api_key="test-key") + response = client.extract( + url="https://example.com", + prompt="Extract the title" + ) -```python -def advanced_custom_handler(): - def smart_handler(method, url, kwargs): - # Handle different endpoints with custom logic - if "/v1/credits" in url: - return { - "remaining_credits": 50, - "total_credits_used": 50, - "mock": true - } - elif "/v1/smartscraper" in url: - # Extract user_prompt from kwargs to create contextual responses - user_prompt = kwargs.get("user_prompt", "") - if "title" in user_prompt.lower(): - return { - "job_id": "mock-title-job", - "status": "completed", - "result": { - "title": "Extracted Title", - "content": "This is the extracted content", - "mock": true - } - } - else: - return { - "job_id": "mock-generic-job", - "status": "completed", - "result": { - "data": "Generic extracted data", - "mock": true - } - } - else: - return {"error": "Unknown endpoint", "url": url} - - client = Client.from_env(mock=True, mock_handler=smart_handler) - - # Test different scenarios - print("Credits:", client.get_credits()) - print("Title extraction:", client.smartscraper( - website_url="https://example.com", - user_prompt="Extract the title" - )) - print("Generic extraction:", client.smartscraper( - 
website_url="https://example.com", - user_prompt="Extract some data" - )) + assert response["data"]["title"] == "Mock Title" ``` - - -## Testing Best Practices -### Unit Testing with Mocks +### Using `pytest` Fixtures ```python -import unittest -from unittest.mock import patch +import pytest +from unittest.mock import MagicMock from scrapegraph_py import Client -class TestScrapeGraphAI(unittest.TestCase): - def setUp(self): - self.client = Client.from_env(mock=True) - - def test_get_credits(self): - credits = self.client.get_credits() - self.assertIn("remaining_credits", credits) - self.assertIn("total_credits_used", credits) - - def test_smartscraper_with_schema(self): - from pydantic import BaseModel, Field - - class TestSchema(BaseModel): - title: str = Field(description="Page title") - content: str = Field(description="Page content") - - response = self.client.smartscraper( - website_url="https://example.com", - user_prompt="Extract title and content", - output_schema=TestSchema - ) - - self.assertIsInstance(response, TestSchema) - self.assertIsNotNone(response.title) - self.assertIsNotNone(response.content) - -if __name__ == "__main__": - unittest.main() -``` - -### Integration Testing - -```python -def test_integration_flow(): - """Test a complete workflow using mocks""" - client = Client.from_env( - mock=True, - mock_responses={ - "/v1/credits": {"remaining_credits": 10, "total_credits_used": 90, "mock": true}, - "/v1/smartscraper/start": { - "job_id": "test-job-123", - "status": "processing", - "mock": true - }, - "/v1/smartscraper/status/test-job-123": { - "job_id": "test-job-123", - "status": "completed", - "result": { - "title": "Test Page", - "content": "Test content", - "mock": true - } - } - } +@pytest.fixture +def mock_client(): + client = Client(api_key="test-key") + client.extract = MagicMock(return_value={ + "data": {"title": "Mock Title"}, + "request_id": "mock-123" + }) + client.search = MagicMock(return_value={ + "data": {"results": []}, + 
"request_id": "mock-456" + }) + client.credits = MagicMock(return_value={ + "remaining_credits": 100, + "total_credits_used": 0 + }) + return client + +def test_extract(mock_client): + response = mock_client.extract( + url="https://example.com", + prompt="Extract the title" ) - - # Test the complete flow - credits = client.get_credits() - assert credits["remaining_credits"] == 10 - - # Start a scraping job - job = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract title and content" - ) - - # Check job status - status = client.get_smartscraper("test-job-123") - assert status["status"] == "completed" - assert "title" in status["result"] -``` - -## Environment Variables - -You can also control mocking through environment variables: + assert response["data"]["title"] == "Mock Title" -```bash -# Enable mock mode via environment variable -export SGAI_MOCK=true - -# Set custom mock responses (JSON format) -export SGAI_MOCK_RESPONSES='{"\/v1\/credits": {"remaining_credits": 100, "mock": true}}' +def test_credits(mock_client): + credits = mock_client.credits() + assert credits["remaining_credits"] == 100 ``` -```python -# The client will automatically detect mock mode from environment -client = Client.from_env() # Will use mock mode if SGAI_MOCK=true -``` - -## Async Mocking - -Mocking works seamlessly with async clients: +### Async Testing with `aioresponses` ```python +import pytest import asyncio +from aioresponses import aioresponses from scrapegraph_py import AsyncClient -async def async_mock_example(): - async with AsyncClient(mock=True) as client: - # All async methods work with mocks - credits = await client.get_credits() - print(f"Mock credits: {credits}") - - response = await client.smartscraper( - website_url="https://example.com", - user_prompt="Extract data" +@pytest.mark.asyncio +async def test_async_extract(): + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/api/v2/extract", + payload={ + "data": 
{"title": "Async Mock"}, + "request_id": "async-123" + }, ) - print(f"Mock response: {response}") - -# Run the async example -asyncio.run(async_mock_example()) -``` -## HTTP Method Mocking with cURL + async with AsyncClient(api_key="test-key") as client: + response = await client.extract( + url="https://example.com", + prompt="Extract data" + ) -You can also test ScrapeGraphAI endpoints directly using cURL with mock responses. This is useful for testing API integrations without using SDKs. - -### Basic cURL Mock Usage - -```bash -# Enable mock mode via environment variable -export SGAI_MOCK=true - -# Test credits endpoint with mock -curl -X GET "https://api.scrapegraph.ai/v1/credits" \ - -H "Authorization: Bearer $SGAI_API_KEY" \ - -H "Content-Type: application/json" -``` - -### Custom Mock Responses with cURL - -```bash -# Set custom mock responses via environment variable -export SGAI_MOCK_RESPONSES='{ - "/v1/credits": { - "remaining_credits": 100, - "total_credits_used": 0, - "mock": true - }, -}' - -# Test smartscraper endpoint -curl -X POST "https://api.scrapegraph.ai/v1/smartscraper/" \ - -H "Authorization: Bearer $SGAI_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "website_url": "https://example.com", - "user_prompt": "Extract title and content" - "mock": true - }' + assert response["data"]["title"] == "Async Mock" ``` -### Testing Different HTTP Methods +## JavaScript SDK Testing -```bash -# POST request - to smartscraper -curl --location 'https://api.scrapegraphai.com/v1/smartscraper' \ ---data '{ - "website_url": "https://www.scrapegraphai.com//", - "user_prompt": "Extract founder info ", - "mock":true -}' -``` +### Using Jest / Vitest -```bash -# POST request - to Markdownify -curl --location 'https://api.scrapegraphai.com/v1/markdownify' \ ---data '{ - "website_url": "https://www.scrapegraphai.com//", - "mock":true -}' -``` - -```bash -# POST request - to SearchScraper -curl --location 'https://api.scrapegraphai.com/v1/searchscraper' \ 
---data '{ - "website_url": "https://www.scrapegraphai.com//", - "mock":true - "output_schema":{}, - "num_results":3, -}' +```javascript +import { describe, it, expect, vi } from "vitest"; +import scrapegraphai from "scrapegraph-js"; + +// Mock the module +vi.mock("scrapegraph-js", () => ({ + default: vi.fn(() => ({ + extract: vi.fn().mockResolvedValue({ + data: { title: "Mock Title" }, + requestId: "mock-123", + }), + search: vi.fn().mockResolvedValue({ + data: { results: [] }, + requestId: "mock-456", + }), + credits: vi.fn().mockResolvedValue({ + data: { remainingCredits: 100 }, + }), + })), +})); + +describe("ScrapeGraphAI", () => { + const sgai = scrapegraphai({ apiKey: "test-key" }); + + it("should extract data", async () => { + const { data } = await sgai.extract("https://example.com", { + prompt: "Extract the title", + }); + expect(data.title).toBe("Mock Title"); + }); + + it("should check credits", async () => { + const { data } = await sgai.credits(); + expect(data.remainingCredits).toBe(100); + }); +}); ``` +### Using MSW (Mock Service Worker) -## JavaScript SDK Mocking - -The JavaScript SDK supports per-request mocking via the `mock` parameter. Pass `mock: true` in the params object of any function to receive mock data instead of making a real API call. 
- -### Per-Request Mock Mode +Mock at the network level for more realistic testing: ```javascript -import { smartScraper, scrape, searchScraper, getCredits } from 'scrapegraph-js'; - -const API_KEY = 'your-api-key'; - -// SmartScraper with mock -const smartResult = await smartScraper(API_KEY, { - website_url: 'https://example.com', - user_prompt: 'Extract the title', - mock: true, -}); -console.log('SmartScraper mock:', smartResult.data); - -// Scrape with mock -const scrapeResult = await scrape(API_KEY, { - website_url: 'https://example.com', - mock: true, -}); -console.log('Scrape mock:', scrapeResult.data); - -// SearchScraper with mock -const searchResult = await searchScraper(API_KEY, { - user_prompt: 'Find AI news', - mock: true, +import { http, HttpResponse } from "msw"; +import { setupServer } from "msw/node"; +import scrapegraphai from "scrapegraph-js"; + +const server = setupServer( + http.post("https://api.scrapegraphai.com/api/v2/extract", () => { + return HttpResponse.json({ + data: { title: "MSW Mock Title" }, + requestId: "msw-123", + }); + }), + http.get("https://api.scrapegraphai.com/api/v2/credits", () => { + return HttpResponse.json({ + data: { remainingCredits: 50, totalCreditsUsed: 50 }, + }); + }) +); + +beforeAll(() => server.listen()); +afterAll(() => server.close()); +afterEach(() => server.resetHandlers()); + +test("extract returns mocked data", async () => { + const sgai = scrapegraphai({ apiKey: "test-key" }); + const { data } = await sgai.extract("https://example.com", { + prompt: "Extract the title", + }); + expect(data.title).toBe("MSW Mock Title"); }); -console.log('SearchScraper mock:', searchResult.data); ``` - -The JavaScript SDK does not have global mock functions like `enableMock()` or `setMockResponses()`. Mock mode is controlled per-request via the `mock: true` parameter. All functions return `ApiResult` — errors are never thrown. 
- +## Testing with cURL -## SDK Comparison - - - - - `Client(mock=True)` initialization - - `mock_responses` parameter for overrides - - `mock_handler` for custom logic - - Environment variable: `SGAI_MOCK=true` - - - - `mock: true` in per-request params - - All functions support mock parameter - - Native async/await - - - - Environment variable: `SGAI_MOCK=true` - - `SGAI_MOCK_RESPONSES` for custom responses - - Direct HTTP method testing - - No SDK dependencies required - - - -### Feature Comparison - -| Feature | Python SDK | JavaScript SDK | cURL/HTTP | -|---------|------------|----------------|-----------| -| **Global Mock Mode** | `Client(mock=True)` | N/A | `SGAI_MOCK=true` | -| **Per-Request Mock** | `{mock: True}` in params | `mock: true` in params | N/A | -| **Custom Responses** | `mock_responses` dict | N/A | `SGAI_MOCK_RESPONSES` | -| **Custom Handler** | `mock_handler` function | N/A | N/A | -| **Environment Variable** | `SGAI_MOCK=true` | N/A | `SGAI_MOCK=true` | -| **Async Support** | `AsyncClient(mock=True)` | Native async/await | N/A | -| **Dependencies** | Python SDK required | JavaScript SDK required | None | - -## Limitations - -* You can't test real-time scraping performance in mock mode. -* Mock responses don't reflect actual website changes or dynamic content. -* Rate limiting and credit consumption are not simulated in mock mode. -* Some advanced features may behave differently in mock mode compared to live mode. 
- -## Troubleshooting - - - -### Mock responses not working -- Ensure `mock=True` is set when initializing the client -- Check that your mock response paths match the actual API endpoints -- Verify the response format matches the expected schema +Test API endpoints directly using cURL. The examples below target the production base URL; substitute the URL of a local mock server or staging environment to avoid making real API calls: -### Custom handler not being called -- Make sure you're passing the `mock_handler` parameter correctly -- Check that your handler function accepts the correct parameters: `(method, url, kwargs)` -- Ensure the handler returns a valid response object +```bash +# Test extract endpoint +curl -X POST "https://api.scrapegraphai.com/api/v2/extract" \ + -H "Authorization: Bearer your-api-key" \ + -H "Content-Type: application/json" \ + -d '{ + "url": "https://example.com", + "prompt": "Extract the title" + }' -### Schema validation errors -- Mock responses must match the expected Pydantic schema structure -- Use the same field names and types as defined in your schema -- Test your mock responses with the actual schema classes +# Test credits endpoint +curl -X GET "https://api.scrapegraphai.com/api/v2/credits" \ + -H "Authorization: Bearer your-api-key" +``` - +## SDK Comparison -## Examples +| Feature | Python | JavaScript | +|---------|--------|------------| +| **Mock library** | `unittest.mock`, `responses` | Jest/Vitest mocks, MSW | +| **HTTP-level mocking** | `responses`, `aioresponses` | MSW (Mock Service Worker) | +| **Async mocking** | `aioresponses`, `unittest.mock` | Native async/await | +| **Fixture support** | pytest fixtures | beforeEach/afterEach | - -Here's a complete example showing all mocking features: +## Best Practices -```python -from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger -from pydantic import BaseModel, Field -from typing import List - -# Set up logging -sgai_logger.set_logging(level="INFO") - -class ProductInfo(BaseModel): - name: str = Field(description="Product name") - price: 
str = Field(description="Product price") - features: List[str] = Field(description="Product features") - -def complete_mock_demo(): - # Initialize with comprehensive mock responses - client = Client.from_env( - mock=True, - mock_responses={ - "/v1/credits": { - "remaining_credits": 25, - "total_credits_used": 75, - "mock": true - }, - "/v1/smartscraper/start": { - "job_id": "demo-job-789", - "status": "processing", - "mock": true - }, - "/v1/smartscraper/status/demo-job-789": { - "job_id": "demo-job-789", - "status": "completed", - "result": { - "name": "iPhone 15 Pro", - "price": "$999", - "features": [ - "A17 Pro chip", - "48MP camera system", - "Titanium design", - "Action Button" - ], - "mock": true - } - } - } - ) - - print("=== ScrapeGraphAI Mock Demo ===\n") - - # Test credits endpoint - print("1. Checking credits:") - credits = client.get_credits() - print(f" Remaining: {credits['remaining_credits']}") - print(f" Used: {credits['total_credits_used']}\n") - - # Test smartscraper with schema - print("2. Extracting product information:") - product = client.smartscraper( - website_url="https://apple.com/iphone-15-pro", - user_prompt="Extract product name, price, and key features", - output_schema=ProductInfo - ) - - print(f" Product: {product.name}") - print(f" Price: {product.price}") - print(" Features:") - for feature in product.features: - print(f" - {feature}") - - print("\n3. 
Testing markdownify:") - markdown = client.markdownify(website_url="https://example.com") - print(f" Markdown length: {len(markdown)} characters") - - print("\n=== Demo Complete ===") - -if __name__ == "__main__": - complete_mock_demo() -``` - +- Mock at the **client method level** for unit tests (fastest, simplest) +- Mock at the **HTTP level** for integration tests (validates request/response shapes) +- Use **fixtures** to share mock configurations across tests +- Keep mock responses **realistic** - match the actual API response structure +- Test both **success and error** scenarios ## Support - + Report bugs or request features @@ -596,4 +273,4 @@ if __name__ == "__main__": -Need help with mocking? Check out our [Python SDK documentation](/sdks/python) or join our [Discord community](https://discord.gg/uJN7TYcpNa) for support. +Need help with testing? Join our [Discord community](https://discord.gg/uJN7TYcpNa) for support. diff --git a/sdks/python.mdx b/sdks/python.mdx index 43da3f2..19b4b51 100644 --- a/sdks/python.mdx +++ b/sdks/python.mdx @@ -1,6 +1,6 @@ --- title: 'Python SDK' -description: 'Official Python SDK for ScrapeGraphAI' +description: 'Official Python SDK for ScrapeGraphAI v2' icon: 'python' --- @@ -21,23 +21,23 @@ icon: 'python' ## Installation -Install the package using pip: - ```bash pip install scrapegraph-py ``` -## Features +## What's New in v2 -- **AI-Powered Extraction**: Advanced web scraping using artificial intelligence -- **Flexible Clients**: Both synchronous and asynchronous support -- **Type Safety**: Structured output with Pydantic schemas -- **Production Ready**: Detailed logging and automatic retries -- **Developer Friendly**: Comprehensive error handling +- **Renamed methods**: `smartscraper()` → `extract()`, `searchscraper()` → `search()` +- **Unified config objects**: `FetchConfig` and `LlmConfig` replace scattered parameters +- **Namespace methods**: `crawl.start()`, `crawl.status()`, `monitor.create()`, etc. 
+- **New endpoints**: `credits()`, `history()`, `crawl.stop()`, `crawl.resume()` +- **Removed**: `markdownify()`, `agenticscraper()`, `sitemap()`, `healthz()`, `feedback()`, built-in mock mode -## Quick Start + +v2 is a breaking release. If you're upgrading from v1, see the [Migration Guide](https://github.com/ScrapeGraphAI/scrapegraph-py/blob/main/MIGRATION_V2.md). + -Initialize the client with your API key: +## Quick Start ```python from scrapegraph_py import Client @@ -49,30 +49,42 @@ client = Client(api_key="your-api-key-here") You can also set the `SGAI_API_KEY` environment variable and initialize the client without parameters: `client = Client()` +### Client Options + +| Parameter | Type | Default | Description | +| ------------- | ------ | -------------------------------- | ------------------------------- | +| api_key | string | `SGAI_API_KEY` env var | Your ScrapeGraphAI API key | +| base_url | string | `https://api.scrapegraphai.com` | API base URL | +| verify_ssl | bool | `True` | Verify SSL certificates | +| timeout | int | `30` | Request timeout in seconds | +| max_retries | int | `3` | Maximum number of retries | +| retry_delay | float | `1.0` | Delay between retries (seconds) | + ## Services -### SmartScraper +### Extract -Extract specific information from any webpage using AI: +Extract structured data from any webpage using AI. Replaces the v1 `smartscraper()` method. ```python -response = client.smartscraper( - website_url="https://example.com", - user_prompt="Extract the main heading and description" +response = client.extract( + url="https://example.com", + prompt="Extract the main heading and description" ) +print(response) ``` #### Parameters -| Parameter | Type | Required | Description | -| ---------------- | ------- | -------- | ---------------------------------------------------------------------------------- | -| website_url | string | Yes | The URL of the webpage that needs to be scraped. 
| -| user_prompt | string | Yes | A textual description of what you want to achieve. | -| output_schema | object | No | The Pydantic object that describes the structure and format of the response. | - - -Define a simple schema for basic data extraction: +| Parameter | Type | Required | Description | +| ------------ | ----------- | -------- | -------------------------------------------------------- | +| url | string | Yes | The URL of the webpage to scrape | +| prompt | string | Yes | A description of what you want to extract | +| output_schema| object | No | Pydantic model for structured response | +| fetch_config | FetchConfig | No | Fetch configuration (stealth, rendering, etc.) | +| llm_config | LlmConfig | No | LLM configuration (model, temperature, etc.) | + ```python from pydantic import BaseModel, Field @@ -81,93 +93,60 @@ class ArticleData(BaseModel): author: str = Field(description="The author's name") publish_date: str = Field(description="Article publication date") content: str = Field(description="Main article content") - category: str = Field(description="Article category") -response = client.smartscraper( - website_url="https://example.com/blog/article", - user_prompt="Extract the article information", +response = client.extract( + url="https://example.com/blog/article", + prompt="Extract the article information", output_schema=ArticleData ) -print(f"Title: {response.title}") -print(f"Author: {response.author}") -print(f"Published: {response.publish_date}") +print(f"Title: {response['data']['title']}") +print(f"Author: {response['data']['author']}") ``` - -Define a complex schema for nested data structures: - + ```python -from typing import List -from pydantic import BaseModel, Field - -class Employee(BaseModel): - name: str = Field(description="Employee's full name") - position: str = Field(description="Job title") - department: str = Field(description="Department name") - email: str = Field(description="Email address") - -class Office(BaseModel): - 
location: str = Field(description="Office location/city") - address: str = Field(description="Full address") - phone: str = Field(description="Contact number") - -class CompanyData(BaseModel): - name: str = Field(description="Company name") - description: str = Field(description="Company description") - industry: str = Field(description="Industry sector") - founded_year: int = Field(description="Year company was founded") - employees: List[Employee] = Field(description="List of key employees") - offices: List[Office] = Field(description="Company office locations") - website: str = Field(description="Company website URL") - -# Extract comprehensive company information -response = client.smartscraper( - website_url="https://example.com/about", - user_prompt="Extract detailed company information including employees and offices", - output_schema=CompanyData +from scrapegraph_py import FetchConfig, LlmConfig + +response = client.extract( + url="https://example.com", + prompt="Extract the main heading", + fetch_config=FetchConfig( + stealth=True, + render_js=True, + wait_ms=2000, + scrolls=3, + ), + llm_config=LlmConfig( + temperature=0.3, + max_tokens=1000, + ), ) - -# Access nested data -print(f"Company: {response.name}") -print("\nKey Employees:") -for employee in response.employees: - print(f"- {employee.name} ({employee.position})") - -print("\nOffice Locations:") -for office in response.offices: - print(f"- {office.location}: {office.address}") ``` -### SearchScraper +### Search -Search and extract information from multiple web sources using AI: +Search the web and extract information from multiple sources. Replaces the v1 `searchscraper()` method. 
```python -from scrapegraph_py.models import TimeRange - -response = client.searchscraper( - user_prompt="What are the key features and pricing of ChatGPT Plus?", - time_range=TimeRange.PAST_WEEK # Optional: Filter results by time range +response = client.search( + query="What are the key features and pricing of ChatGPT Plus?" ) ``` #### Parameters -| Parameter | Type | Required | Description | -| ---------------- | ------- | -------- | ---------------------------------------------------------------------------------- | -| user_prompt | string | Yes | A textual description of what you want to achieve. | -| num_results | number | No | Number of websites to search (3-20). Default: 3. | -| extraction_mode | boolean | No | **True** = AI extraction mode (10 credits/page), **False** = markdown mode (2 credits/page). Default: True | -| output_schema | object | No | The Pydantic object that describes the structure and format of the response (AI extraction mode only) | -| location_geo_code| string | No | Optional geo code for location-based search (e.g., "us") | -| time_range | TimeRange| No | Optional time range filter for search results. Options: TimeRange.PAST_HOUR, TimeRange.PAST_24_HOURS, TimeRange.PAST_WEEK, TimeRange.PAST_MONTH, TimeRange.PAST_YEAR | - - -Define a simple schema for structured search results: +| Parameter | Type | Required | Description | +| ------------- | ----------- | -------- | -------------------------------------------------------- | +| query | string | Yes | The search query | +| num_results | number | No | Number of results (3-20). 
Default: 3 | +| output_schema | object | No | Pydantic model for structured response | +| fetch_config | FetchConfig | No | Fetch configuration | +| llm_config | LlmConfig | No | LLM configuration | + ```python from pydantic import BaseModel, Field from typing import List @@ -177,174 +156,153 @@ class ProductInfo(BaseModel): description: str = Field(description="Product description") price: str = Field(description="Product price") features: List[str] = Field(description="List of key features") - availability: str = Field(description="Availability information") -from scrapegraph_py.models import TimeRange - -response = client.searchscraper( - user_prompt="Find information about iPhone 15 Pro", +response = client.search( + query="Find information about iPhone 15 Pro", output_schema=ProductInfo, - location_geo_code="us", # Optional: Geo code for location-based search - time_range=TimeRange.PAST_MONTH # Optional: Filter results by time range + num_results=5, ) -print(f"Product: {response.name}") -print(f"Price: {response.price}") -print("\nFeatures:") -for feature in response.features: - print(f"- {feature}") +print(f"Product: {response['data']['name']}") +print(f"Price: {response['data']['price']}") ``` - -Define a complex schema for comprehensive market research: +### Scrape -```python -from typing import List -from pydantic import BaseModel, Field +Convert any webpage into markdown, HTML, screenshot, or branding format. 
-class MarketPlayer(BaseModel): - name: str = Field(description="Company name") - market_share: str = Field(description="Market share percentage") - key_products: List[str] = Field(description="Main products in market") - strengths: List[str] = Field(description="Company's market strengths") - -class MarketTrend(BaseModel): - name: str = Field(description="Trend name") - description: str = Field(description="Trend description") - impact: str = Field(description="Expected market impact") - timeframe: str = Field(description="Trend timeframe") - -class MarketAnalysis(BaseModel): - market_size: str = Field(description="Total market size") - growth_rate: str = Field(description="Annual growth rate") - key_players: List[MarketPlayer] = Field(description="Major market players") - trends: List[MarketTrend] = Field(description="Market trends") - challenges: List[str] = Field(description="Industry challenges") - opportunities: List[str] = Field(description="Market opportunities") - -from scrapegraph_py.models import TimeRange - -# Perform comprehensive market research -response = client.searchscraper( - user_prompt="Analyze the current AI chip market landscape", - output_schema=MarketAnalysis, - location_geo_code="us", # Optional: Geo code for location-based search - time_range=TimeRange.PAST_MONTH # Optional: Filter results by time range +```python +response = client.scrape( + url="https://example.com" ) - -# Access structured market data -print(f"Market Size: {response.market_size}") -print(f"Growth Rate: {response.growth_rate}") - -print("\nKey Players:") -for player in response.key_players: - print(f"\n{player.name}") - print(f"Market Share: {player.market_share}") - print("Key Products:") - for product in player.key_products: - print(f"- {product}") - -print("\nMarket Trends:") -for trend in response.trends: - print(f"\n{trend.name}") - print(f"Impact: {trend.impact}") - print(f"Timeframe: {trend.timeframe}") ``` - - -Use markdown mode for cost-effective content 
gathering: +#### Parameters -```python -from scrapegraph_py import Client +| Parameter | Type | Required | Description | +| ------------- | ----------- | -------- | -------------------------------------------------------- | +| url | string | Yes | The URL of the webpage to scrape | +| output_format | string | No | Output format: `"markdown"`, `"html"`, `"screenshot"`, `"branding"` | +| fetch_config | FetchConfig | No | Fetch configuration | -client = Client(api_key="your-api-key") +### Crawl -from scrapegraph_py.models import TimeRange +Manage multi-page crawl operations asynchronously. -# Enable markdown mode for cost-effective content gathering -response = client.searchscraper( - user_prompt="Latest developments in artificial intelligence", - num_results=3, - extraction_mode=False, # Enable markdown mode (2 credits per page vs 10 credits) - location_geo_code="us", # Optional: Geo code for location-based search - time_range=TimeRange.PAST_WEEK # Optional: Filter results by time range +```python +# Start a crawl +job = client.crawl.start( + url="https://example.com", + depth=2, + include_patterns=["/blog/*", "/docs/**"], + exclude_patterns=["/admin/*", "/api/*"], ) +print(f"Crawl started: {job['id']}") -# Access the raw markdown content -markdown_content = response['markdown_content'] -reference_urls = response['reference_urls'] +# Check status +status = client.crawl.status(job["id"]) +print(f"Status: {status['status']}") -print(f"Markdown content length: {len(markdown_content)} characters") -print(f"Reference URLs: {len(reference_urls)}") +# Stop a crawl +client.crawl.stop(job["id"]) -# Process the markdown content -print("Content preview:", markdown_content[:500] + "...") +# Resume a crawl +client.crawl.resume(job["id"]) +``` -# Save to file for analysis -with open('ai_research_content.md', 'w', encoding='utf-8') as f: - f.write(markdown_content) +#### crawl.start() Parameters -print("Content saved to ai_research_content.md") -``` +| Parameter | Type | Required | 
Description | +| ---------------- | ----------- | -------- | -------------------------------------------------------- | +| url | string | Yes | The starting URL to crawl | +| depth | int | No | Crawl depth level | +| include_patterns | list[str] | No | URL patterns to include (`*` any chars, `**` any path) | +| exclude_patterns | list[str] | No | URL patterns to exclude | +| fetch_config | FetchConfig | No | Fetch configuration | -**Markdown Mode Benefits:** -- **Cost-effective**: Only 2 credits per page (vs 10 credits for AI extraction) -- **Full content**: Get complete page content in markdown format -- **Faster**: No AI processing overhead -- **Perfect for**: Content analysis, bulk data collection, building datasets +### Monitor - +Create and manage site monitoring jobs. + +```python +# Create a monitor +monitor = client.monitor.create( + url="https://example.com", + prompt="Track price changes", + schedule="daily", +) + +# List all monitors +monitors = client.monitor.list() + +# Get a specific monitor +details = client.monitor.get(monitor["id"]) + +# Pause / Resume / Delete +client.monitor.pause(monitor["id"]) +client.monitor.resume(monitor["id"]) +client.monitor.delete(monitor["id"]) +``` + +### Credits - -Filter search results by date range to get only recent information: +Check your account credit balance. ```python -from scrapegraph_py import Client -from scrapegraph_py.models import TimeRange +credits = client.credits() +print(f"Remaining: {credits['remaining_credits']}") +print(f"Used: {credits['total_credits_used']}") +``` -client = Client(api_key="your-api-key") +### History -# Search for recent news from the past week -response = client.searchscraper( - user_prompt="Latest news about AI developments", - num_results=5, - time_range=TimeRange.PAST_WEEK # Options: PAST_HOUR, PAST_24_HOURS, PAST_WEEK, PAST_MONTH, PAST_YEAR -) +Retrieve paginated request history with optional service filtering. 
-print("Recent AI news:", response['result']) -print("Reference URLs:", response['reference_urls']) +```python +history = client.history(page=1, per_page=20, service="extract") +for entry in history["items"]: + print(f"{entry['created_at']} - {entry['service']} - {entry['status']}") ``` -**Time Range Options:** -- `TimeRange.PAST_HOUR` - Results from the past hour -- `TimeRange.PAST_24_HOURS` - Results from the past 24 hours -- `TimeRange.PAST_WEEK` - Results from the past week -- `TimeRange.PAST_MONTH` - Results from the past month -- `TimeRange.PAST_YEAR` - Results from the past year +## Configuration Objects -**Use Cases:** -- Finding recent news and updates -- Tracking time-sensitive information -- Getting latest product releases -- Monitoring recent market changes +### FetchConfig - +Controls how pages are fetched. -### Markdownify +```python +from scrapegraph_py import FetchConfig + +config = FetchConfig( + stealth=True, # Anti-detection mode + render_js=True, # Render JavaScript + wait_ms=2000, # Wait time after page load (ms) + scrolls=3, # Number of scrolls + country="us", # Proxy country code + cookies={"key": "value"}, + headers={"X-Custom": "header"}, +) +``` -Convert any webpage into clean, formatted markdown: +### LlmConfig + +Controls LLM behavior for AI-powered methods. 
```python -response = client.markdownify( - website_url="https://example.com" +from scrapegraph_py import LlmConfig + +config = LlmConfig( + model="default", # LLM model to use + temperature=0.3, # Response creativity (0-1) + max_tokens=1000, # Maximum response tokens + chunker="auto", # Content chunking strategy ) ``` ## Async Support -All endpoints support asynchronous operations: +All methods are available on the async client: ```python import asyncio @@ -352,38 +310,32 @@ from scrapegraph_py import AsyncClient async def main(): async with AsyncClient() as client: - response = await client.smartscraper( - website_url="https://example.com", - user_prompt="Extract the main content" + # Extract + response = await client.extract( + url="https://example.com", + prompt="Extract the main content" ) print(response) -asyncio.run(main()) -``` - -## Feedback + # Crawl + job = await client.crawl.start("https://example.com", depth=2) + status = await client.crawl.status(job["id"]) + print(status) -Help us improve by submitting feedback programmatically: + # Credits + credits = await client.credits() + print(credits) -```python -client.submit_feedback( - request_id="your-request-id", - rating=5, - feedback_text="Great results!" -) +asyncio.run(main()) ``` ## Support - + Report issues and contribute to the SDK Get help from our development team - - - This project is licensed under the MIT License. See the [LICENSE](https://github.com/ScrapeGraphAI/scrapegraph-sdk/blob/main/LICENSE) file for details. 
- diff --git a/services/cli/ai-agent-skill.mdx b/services/cli/ai-agent-skill.mdx index 50ee527..5ecd435 100644 --- a/services/cli/ai-agent-skill.mdx +++ b/services/cli/ai-agent-skill.mdx @@ -17,9 +17,10 @@ Browse the skill: [skills.sh/scrapegraphai/just-scrape/just-scrape](https://skil Once installed, your coding agent can: -- Scrape a website to gather data needed for a task +- Extract structured data from any website using AI - Convert documentation pages to markdown for context - Search the web and extract structured results +- Crawl multiple pages and collect data - Check your credit balance mid-session - Browse request history @@ -28,13 +29,13 @@ Once installed, your coding agent can: Agents call `just-scrape` in `--json` mode for clean, token-efficient output: ```bash -just-scrape smart-scraper https://api.example.com/docs \ +just-scrape extract https://api.example.com/docs \ -p "Extract all endpoint names, methods, and descriptions" \ --json ``` ```bash -just-scrape search-scraper "latest release notes for react-query" \ +just-scrape search "latest release notes for react-query" \ --num-results 3 --json ``` @@ -76,12 +77,11 @@ This project uses `just-scrape` (ScrapeGraph AI CLI) for web scraping. The API key is set via the SGAI_API_KEY environment variable. 
Available commands (always use --json flag): -- `just-scrape smart-scraper <url> -p <prompt> --json` — AI extraction from a URL -- `just-scrape search-scraper <query> --json` — search the web and extract data +- `just-scrape extract <url> -p <prompt> --json` — AI extraction from a URL +- `just-scrape search <query> --json` — search the web and extract data - `just-scrape markdownify <url> --json` — convert a page to markdown -- `just-scrape crawl <url> -p <prompt> --json` — crawl multiple pages -- `just-scrape scrape <url> --json` — get raw HTML -- `just-scrape sitemap <url> --json` — get all URLs from a sitemap +- `just-scrape crawl <url> --json` — crawl multiple pages +- `just-scrape scrape <url> --json` — get page content (markdown, html, screenshot, branding) Use --schema to enforce a JSON schema on the output. Use --stealth for sites with anti-bot protection. @@ -120,7 +120,7 @@ claude -p "Use just-scrape to scrape https://example.com/changelog \ - Pass `--schema` with a JSON schema to get typed, predictable output: ```bash -just-scrape smart-scraper https://example.com \ +just-scrape extract https://example.com \ -p "Extract company info" \ --schema '{"type":"object","properties":{"name":{"type":"string"},"founded":{"type":"number"}}}' \ --json diff --git a/services/cli/commands.mdx b/services/cli/commands.mdx index 566f827..4b4a54d 100644 --- a/services/cli/commands.mdx +++ b/services/cli/commands.mdx @@ -3,35 +3,34 @@ title: 'Commands' description: 'Full reference for every just-scrape command and its flags' --- -## smart-scraper +## extract -Extract structured data from any URL using AI. 
[Full docs →](/services/extract) ```bash -just-scrape smart-scraper -p -just-scrape smart-scraper -p --schema -just-scrape smart-scraper -p --scrolls # infinite scroll (0-100) -just-scrape smart-scraper -p --pages # multi-page (1-100) -just-scrape smart-scraper -p --stealth # anti-bot bypass (+4 credits) -just-scrape smart-scraper -p --cookies --headers -just-scrape smart-scraper -p --plain-text # plain text instead of JSON +just-scrape extract -p +just-scrape extract -p --schema +just-scrape extract -p --scrolls # infinite scroll (0-100) +just-scrape extract -p --stealth # anti-bot bypass (+4 credits) +just-scrape extract -p --cookies --headers +just-scrape extract -p --country # geo-targeting ``` -## search-scraper +## search -Search the web and extract structured data from results. [Full docs →](/services/searchscraper) +Search the web and extract structured data from results. [Full docs →](/services/search) ```bash -just-scrape search-scraper -just-scrape search-scraper --num-results # sources to scrape (3-20, default 3) -just-scrape search-scraper --no-extraction # markdown only (2 credits vs 10) -just-scrape search-scraper --schema -just-scrape search-scraper --stealth --headers +just-scrape search +just-scrape search -p # extraction prompt for results +just-scrape search --num-results # sources to scrape (1-20, default 3) +just-scrape search --schema +just-scrape search --headers ``` ## markdownify -Convert any webpage to clean markdown. [Full docs →](/services/markdownify) +Convert any webpage to clean markdown (uses `scrape` with `format=markdown` internally). [Full docs →](/services/scrape) ```bash just-scrape markdownify @@ -39,74 +38,44 @@ just-scrape markdownify --stealth just-scrape markdownify --headers ``` -## crawl - -Crawl multiple pages and extract data from each. 
[Full docs →](/services/smartcrawler) - -```bash -just-scrape crawl -p -just-scrape crawl -p --max-pages # max pages (default 10) -just-scrape crawl -p --depth # crawl depth (default 1) -just-scrape crawl --no-extraction --max-pages # markdown only (2 credits/page) -just-scrape crawl -p --schema -just-scrape crawl -p --rules # include_paths, same_domain -just-scrape crawl -p --no-sitemap # skip sitemap discovery -just-scrape crawl -p --stealth -``` - ## scrape -Get raw HTML content from a URL. [Full docs →](/services/scrape) +Scrape content from a URL in your preferred format. [Full docs →](/services/scrape) ```bash just-scrape scrape -just-scrape scrape --stealth # anti-bot bypass (+4 credits) -just-scrape scrape --branding # extract branding (+2 credits) -just-scrape scrape --country-code # geo-targeting -``` - -## sitemap - -Get all URLs from a website's sitemap. [Full docs →](/services/sitemap) - -```bash -just-scrape sitemap -just-scrape sitemap --json | jq -r '.urls[]' +just-scrape scrape -f html # output as HTML +just-scrape scrape -f screenshot # take a screenshot +just-scrape scrape -f branding # extract branding info +just-scrape scrape --stealth # anti-bot bypass (+4 credits) +just-scrape scrape --country # geo-targeting ``` -## agentic-scraper - -Browser automation with AI — login, click, navigate, fill forms. [Full docs →](/services/agenticscraper) - -```bash -just-scrape agentic-scraper -s -just-scrape agentic-scraper -s --ai-extraction -p -just-scrape agentic-scraper -s --schema -just-scrape agentic-scraper -s --use-session # persist browser session -``` - -## generate-schema +## crawl -Generate a JSON schema from a natural language description. +Crawl multiple pages starting from a URL. 
[Full docs →](/services/crawl) ```bash -just-scrape generate-schema -just-scrape generate-schema --existing-schema +just-scrape crawl +just-scrape crawl --max-pages # max pages (default 50) +just-scrape crawl --max-depth # crawl depth (default 2) +just-scrape crawl --max-links-per-page # max links per page (default 10) +just-scrape crawl --allow-external # allow external domains +just-scrape crawl --stealth ``` ## history -Browse request history for any service. Interactive by default — arrow keys to navigate, select to view details. +View request history for a service. Interactive by default — arrow keys to navigate, select to view details. ```bash just-scrape history -just-scrape history -just-scrape history --page # start from page (default 1) -just-scrape history --page-size # results per page (max 100) +just-scrape history --page # start from page (default 1) +just-scrape history --page-size # results per page (max 100) just-scrape history --json ``` -Services: `markdownify`, `smartscraper`, `searchscraper`, `scrape`, `crawl`, `agentic-scraper`, `sitemap` +Services: `scrape`, `extract`, `search`, `monitor`, `crawl` ## credits @@ -117,14 +86,6 @@ just-scrape credits just-scrape credits --json | jq '.remaining_credits' ``` -## validate - -Validate your API key. 
- -```bash -just-scrape validate -``` - ## Global flags All commands support these flags: diff --git a/services/cli/examples.mdx b/services/cli/examples.mdx index 68c7e3a..6461a24 100644 --- a/services/cli/examples.mdx +++ b/services/cli/examples.mdx @@ -3,39 +3,39 @@ title: 'Examples' description: 'Practical examples for every just-scrape command' --- -## smart-scraper +## extract ```bash # Extract product listings -just-scrape smart-scraper https://store.example.com/shoes \ +just-scrape extract https://store.example.com/shoes \ -p "Extract all product names, prices, and ratings" # Enforce output schema + scroll to load more content -just-scrape smart-scraper https://news.example.com \ +just-scrape extract https://news.example.com \ -p "Get all article headlines and dates" \ --schema '{"type":"object","properties":{"articles":{"type":"array","items":{"type":"object","properties":{"title":{"type":"string"},"date":{"type":"string"}}}}}}' \ --scrolls 5 # Anti-bot bypass for JS-heavy SPAs -just-scrape smart-scraper https://app.example.com/dashboard \ +just-scrape extract https://app.example.com/dashboard \ -p "Extract user stats" \ --stealth ``` -## search-scraper +## search ```bash # Research across multiple sources -just-scrape search-scraper "What are the best Python web frameworks in 2025?" \ +just-scrape search "What are the best Python web frameworks in 2025?" 
\ --num-results 10 -# Get raw markdown only (cheaper — 2 credits vs 10) -just-scrape search-scraper "React vs Vue comparison" \ - --no-extraction --num-results 5 - # Structured output with schema -just-scrape search-scraper "Top 5 cloud providers pricing" \ +just-scrape search "Top 5 cloud providers pricing" \ --schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"}}}}}}' + +# With extraction prompt +just-scrape search "React vs Vue comparison" \ + -p "Summarize the key differences" ``` ## markdownify @@ -52,87 +52,51 @@ just-scrape markdownify https://docs.example.com/api \ just-scrape markdownify https://protected.example.com --stealth ``` -## crawl - -```bash -# Crawl a docs site and collect code examples -just-scrape crawl https://docs.example.com \ - -p "Extract all code snippets with their language" \ - --max-pages 20 --depth 3 - -# Crawl only blog pages, skip everything else -just-scrape crawl https://example.com \ - -p "Extract article titles and summaries" \ - --rules '{"include_paths":["/blog/*"],"same_domain":true}' \ - --max-pages 50 - -# Raw markdown from all pages (no AI extraction, cheaper) -just-scrape crawl https://example.com \ - --no-extraction --max-pages 10 -``` - ## scrape ```bash -# Get raw HTML +# Get markdown (default format) just-scrape scrape https://example.com -# Geo-targeted + anti-bot bypass -just-scrape scrape https://store.example.com \ - --stealth --country-code DE - -# Extract branding info (logos, colors, fonts) -just-scrape scrape https://example.com --branding -``` +# Get raw HTML +just-scrape scrape https://example.com -f html -## sitemap +# Take a screenshot +just-scrape scrape https://example.com -f screenshot -```bash -# List all pages on a site -just-scrape sitemap https://example.com +# Extract branding info (logos, colors, fonts) +just-scrape scrape https://example.com -f branding -# Pipe URLs to another tool 
-just-scrape sitemap https://example.com --json | jq -r '.urls[]' +# Geo-targeted + anti-bot bypass +just-scrape scrape https://store.example.com \ + --stealth --country DE ``` -## agentic-scraper +## crawl ```bash -# Log in and extract dashboard data -just-scrape agentic-scraper https://app.example.com/login \ - -s "Fill email with user@test.com,Fill password with secret,Click Sign In" \ - --ai-extraction -p "Extract all dashboard metrics" - -# Navigate a multi-step form -just-scrape agentic-scraper https://example.com/wizard \ - -s "Click Next,Select Premium plan,Fill name with John,Click Submit" - -# Persistent browser session across multiple runs -just-scrape agentic-scraper https://app.example.com \ - -s "Click Settings" --use-session -``` - -## generate-schema +# Crawl a docs site +just-scrape crawl https://docs.example.com \ + --max-pages 20 --max-depth 3 -```bash -# Generate a schema from a description -just-scrape generate-schema "E-commerce product with name, price, ratings, and reviews array" +# Allow external links +just-scrape crawl https://example.com \ + --max-pages 50 --allow-external -# Refine an existing schema -just-scrape generate-schema "Add an availability field" \ - --existing-schema '{"type":"object","properties":{"name":{"type":"string"},"price":{"type":"number"}}}' +# Stealth mode for protected sites +just-scrape crawl https://example.com --stealth ``` ## history ```bash # Interactive history browser -just-scrape history smartscraper +just-scrape history extract -# Fetch a specific request by ID -just-scrape history smartscraper abc123-def456-7890 +# Export last 100 extract jobs as JSON +just-scrape history extract --json --page-size 100 \ + | jq '.[] | {id: .request_id, status}' -# Export last 100 crawl jobs as JSON -just-scrape history crawl --json --page-size 100 \ - | jq '.requests[] | {id: .request_id, status}' +# Browse crawl history +just-scrape history crawl --json ``` diff --git a/services/cli/introduction.mdx 
b/services/cli/introduction.mdx index 7ccc61b..c36aac6 100644 --- a/services/cli/introduction.mdx +++ b/services/cli/introduction.mdx @@ -56,21 +56,20 @@ The CLI needs a ScrapeGraph API key. Four ways to provide it (checked in order): | Variable | Description | Default | |---|---|---| | `SGAI_API_KEY` | ScrapeGraph API key | — | -| `JUST_SCRAPE_API_URL` | Override API base URL | `https://api.scrapegraphai.com/v1` | +| `JUST_SCRAPE_API_URL` | Override API base URL | `https://api.scrapegraphai.com/api/v2` | | `JUST_SCRAPE_TIMEOUT_S` | Request/polling timeout in seconds | `120` | | `JUST_SCRAPE_DEBUG` | Set to `1` to enable debug logging | `0` | ## Verify your setup ```bash -just-scrape validate # check your API key just-scrape credits # check your credit balance ``` ## Quick start ```bash -just-scrape smart-scraper https://news.ycombinator.com \ +just-scrape extract https://news.ycombinator.com \ -p "Extract the top 5 story titles and their URLs" ``` diff --git a/services/cli/json-mode.mdx b/services/cli/json-mode.mdx index 60d13c8..4d793f5 100644 --- a/services/cli/json-mode.mdx +++ b/services/cli/json-mode.mdx @@ -23,7 +23,7 @@ just-scrape [args] --json ### Save results to a file ```bash -just-scrape smart-scraper https://store.example.com \ +just-scrape extract https://store.example.com \ -p "Extract all product names and prices" \ --json > products.json ``` @@ -33,9 +33,7 @@ just-scrape smart-scraper https://store.example.com \ ```bash just-scrape credits --json | jq '.remaining_credits' -just-scrape sitemap https://example.com --json | jq -r '.urls[]' - -just-scrape history smartscraper --json | jq '.requests[] | {id: .request_id, status}' +just-scrape history extract --json | jq '.[] | {id: .request_id, status}' ``` ### Convert a page to markdown and save it @@ -50,7 +48,7 @@ just-scrape markdownify https://docs.example.com/api \ ```bash #!/bin/bash while IFS= read -r url; do - just-scrape smart-scraper "$url" \ + just-scrape extract "$url" \ -p "Extract the 
page title and main content" \ --json >> results.jsonl done < urls.txt diff --git a/services/crawl.mdx b/services/crawl.mdx new file mode 100644 index 0000000..753c00e --- /dev/null +++ b/services/crawl.mdx @@ -0,0 +1,189 @@ +--- +title: 'Crawl' +description: 'Multi-page website crawling with flexible output formats' +icon: 'spider' +--- + +## Overview + +Crawl is an advanced web crawling service that traverses multiple pages, follows links, and returns content in your preferred format (markdown or HTML). It provides namespaced operations for starting, monitoring, stopping, and resuming crawl jobs. + + +Try Crawl instantly in our [interactive playground](https://dashboard.scrapegraphai.com/) + + +## Getting Started + +### Quick Start + + + +```python Python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +# Start a crawl +response = client.crawl.start( + "https://example.com", + depth=2, + max_pages=10, + format="markdown", +) +print("Crawl started:", response) +``` + +```javascript JavaScript +import { crawlStart, crawlStatus } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; + +const response = await crawlStart(apiKey, { + url: 'https://example.com', + depth: 2, + max_pages: 10, + format: 'markdown', +}); + +console.log('Crawl started:', response); +``` + +```bash cURL +curl -X POST https://api.scrapegraphai.com/api/v2/crawl \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-api-key" \ + -d '{ + "url": "https://example.com", + "depth": 2, + "max_pages": 10, + "format": "markdown" + }' +``` + + + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| url | string | Yes | The starting URL to crawl. | +| depth | int | No | How many levels deep to follow links. | +| max_pages | int | No | Maximum number of pages to crawl. | +| format | string | No | Output format: `"markdown"` or `"html"`. Default: `"markdown"`. 
| +| include_patterns | list | No | URL patterns to include (e.g., `["/blog/*"]`). | +| exclude_patterns | list | No | URL patterns to exclude (e.g., `["/admin/*"]`). | +| fetch_config | FetchConfig | No | Configuration for page fetching (headers, stealth, etc.). | + + +Get your API key from the [dashboard](https://dashboard.scrapegraphai.com) + + +## Managing Crawl Jobs + +### Check Status + +```python +status = client.crawl.status(crawl_id) +print("Status:", status) +``` + +### Stop a Running Crawl + +```python +client.crawl.stop(crawl_id) +``` + +### Resume a Stopped Crawl + +```python +client.crawl.resume(crawl_id) +``` + +## Advanced Usage + +### With FetchConfig + +```python +from scrapegraph_py import Client, FetchConfig + +client = Client(api_key="your-api-key") + +response = client.crawl.start( + "https://example.com", + depth=2, + max_pages=10, + format="markdown", + include_patterns=["/blog/*"], + exclude_patterns=["/admin/*"], + fetch_config=FetchConfig( + render_js=True, + stealth=True, + wait_ms=1000, + headers={"User-Agent": "MyBot"}, + ), +) +``` + +### Async Support + +```python +import asyncio +from scrapegraph_py import AsyncClient + +async def main(): + async with AsyncClient(api_key="your-api-key") as client: + job = await client.crawl.start( + "https://example.com", + depth=2, + max_pages=5, + ) + + status = await client.crawl.status(job["id"]) + print("Crawl status:", status) + +asyncio.run(main()) +``` + +## Key Features + + + + Traverse entire websites following links automatically + + + Get results in markdown or HTML format + + + Start, stop, resume, and monitor crawl jobs + + + Include or exclude pages by URL patterns + + + +## Integration Options + +### Official SDKs +- [Python SDK](/sdks/python) - Perfect for data science and backend applications +- [JavaScript SDK](/sdks/javascript) - Ideal for web applications and Node.js + +### AI Framework Integrations +- [LangChain Integration](/integrations/langchain) - Use Crawl in your LLM 
workflows +- [LlamaIndex Integration](/integrations/llamaindex) - Build powerful search and QA systems + +## Support & Resources + + + + Comprehensive guides and tutorials + + + Detailed API documentation + + + Join our Discord community + + + Check out our open-source projects + + diff --git a/services/extract.mdx b/services/extract.mdx new file mode 100644 index 0000000..e644b71 --- /dev/null +++ b/services/extract.mdx @@ -0,0 +1,254 @@ +--- +title: 'Extract' +description: 'AI-powered structured data extraction from any webpage' +icon: 'robot' +--- + + + Extract Service + + +## Overview + +Extract is our flagship LLM-powered web scraping service that intelligently extracts structured data from any website. Using advanced large language models, it understands context and content like a human would, making web data extraction more reliable and efficient than ever. + + +Try Extract instantly in our [interactive playground](https://dashboard.scrapegraphai.com/) + + +## Getting Started + +### Quick Start + + + +```python Python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +response = client.extract( + url="https://scrapegraphai.com/", + prompt="Extract info about the company" +) +``` + +```javascript JavaScript +import { extract } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; + +const response = await extract(apiKey, { + url: 'https://scrapegraphai.com', + prompt: 'What does the company do?', +}); + +console.log(response); +``` + +```bash cURL +curl -X POST https://api.scrapegraphai.com/api/v2/extract \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-api-key" \ + -d '{ + "url": "https://example.com", + "prompt": "Extract product details including name, price, and availability." + }' +``` + + + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| url | string | Yes | The URL of the webpage to scrape. 
| +| prompt | string | Yes | A textual description of what you want to extract. | +| output_schema | object | No | Pydantic or Zod schema for structured response format. | +| fetch_config | FetchConfig | No | Configuration for page fetching (headers, cookies, stealth, etc.). | + + +Get your API key from the [dashboard](https://dashboard.scrapegraphai.com) + + + +```json +{ + "id": "sg-req-abc123", + "status": "completed", + "result": { + "company_name": "ScrapeGraphAI", + "description": "ScrapeGraphAI is a powerful AI scraping API designed for efficient web data extraction...", + "features": [ + "Effortless, cost-effective, and AI-powered data extraction", + "Handles proxy rotation and rate limits", + "Supports a wide variety of websites" + ] + } +} +``` + + +## FetchConfig + +Use `FetchConfig` to control how the page is fetched: + +```python +from scrapegraph_py import Client, FetchConfig + +client = Client(api_key="your-api-key") + +response = client.extract( + url="https://example.com", + prompt="Extract the main content", + fetch_config=FetchConfig( + headers={"User-Agent": "MyBot"}, + cookies={"session": "abc123"}, + scrolls=3, + render_js=True, + stealth=True, + wait_ms=2000, + ), +) +``` + +| Parameter | Type | Description | +|-----------|------|-------------| +| headers | dict | Custom HTTP headers to send. | +| cookies | dict | Cookies to include in the request. | +| scrolls | int | Number of page scrolls (0-100). | +| render_js | bool | Render heavy JavaScript before extraction. | +| stealth | bool | Enable stealth mode to avoid bot detection. | +| wait_ms | int | Milliseconds to wait before capturing page content. | +| country | string | Two-letter ISO country code for geo-targeted proxy routing. 
| + +## Custom Schema Example + +Define exactly what data you want to extract: + + + +```python Python +from pydantic import BaseModel, Field +from scrapegraph_py import Client + +class ArticleData(BaseModel): + title: str = Field(description="Article title") + author: str = Field(description="Author name") + content: str = Field(description="Main article content") + publish_date: str = Field(description="Publication date") + +client = Client(api_key="your-api-key") + +response = client.extract( + url="https://example.com/article", + prompt="Extract the article information", + output_schema=ArticleData +) +``` + +```javascript JavaScript +import { extract } from 'scrapegraph-js'; +import { z } from 'zod'; + +const apiKey = 'your-api-key'; + +const schema = z.object({ + title: z.string().describe('The title of the webpage'), + description: z.string().describe('The description of the webpage'), + summary: z.string().describe('A brief summary of the webpage'), +}); + +const response = await extract(apiKey, { + url: 'https://scrapegraphai.com/', + prompt: 'What does the company do?', + output_schema: schema, +}); + +console.log(response); +``` + + + +## Async Support + +For applications requiring asynchronous execution: + + + +```python Python +import asyncio +from scrapegraph_py import AsyncClient + +async def main(): + async with AsyncClient(api_key="your-api-key") as client: + urls = [ + "https://scrapegraphai.com/", + "https://github.com/ScrapeGraphAI/Scrapegraph-ai", + ] + + tasks = [ + client.extract( + url=url, + prompt="Summarize the main content", + ) + for url in urls + ] + + responses = await asyncio.gather(*tasks, return_exceptions=True) + + for i, response in enumerate(responses): + if isinstance(response, Exception): + print(f"Error for {urls[i]}: {response}") + else: + print(f"Result for {urls[i]}: {response}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + + + +## Key Features + + + + Works with any website structure, including 
JavaScript-rendered content + + + Contextual understanding of content for accurate extraction + + + Returns clean, structured data in your preferred format + + + Define custom output schemas using Pydantic or Zod + + + +## Integration Options + +### Official SDKs +- [Python SDK](/sdks/python) - Perfect for data science and backend applications +- [JavaScript SDK](/sdks/javascript) - Ideal for web applications and Node.js + +### AI Framework Integrations +- [LangChain Integration](/integrations/langchain) - Use Extract in your LLM workflows +- [LlamaIndex Integration](/integrations/llamaindex) - Build powerful search and QA systems + +## Support & Resources + + + + Comprehensive guides and tutorials + + + Detailed API documentation + + + Join our Discord community + + + Check out our open-source projects + + diff --git a/services/monitor.mdx b/services/monitor.mdx new file mode 100644 index 0000000..21ce66f --- /dev/null +++ b/services/monitor.mdx @@ -0,0 +1,213 @@ +--- +title: 'Monitor' +description: 'Scheduled web monitoring with AI-powered extraction' +icon: 'clock' +--- + +## Overview + +Monitor enables you to set up recurring web scraping jobs that automatically extract data on a schedule. Create monitors that run on a cron schedule and extract structured data from any webpage. 
+ + +Try Monitor in our [dashboard](https://dashboard.scrapegraphai.com/) + + +## Getting Started + +### Quick Start + + + +```python Python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +# Create a monitor +monitor = client.monitor.create( + name="Price Tracker", + url="https://example.com/products", + prompt="Extract current product prices", + cron="0 9 * * *", # Daily at 9 AM +) +print("Monitor created:", monitor) +``` + +```javascript JavaScript +import { monitorCreate } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; + +const monitor = await monitorCreate(apiKey, { + name: 'Price Tracker', + url: 'https://example.com/products', + prompt: 'Extract current product prices', + cron: '0 9 * * *', +}); + +console.log('Monitor created:', monitor); +``` + +```bash cURL +curl -X POST https://api.scrapegraphai.com/api/v2/monitor \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-api-key" \ + -d '{ + "name": "Price Tracker", + "url": "https://example.com/products", + "prompt": "Extract current product prices", + "cron": "0 9 * * *" + }' +``` + + + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| name | string | Yes | A descriptive name for the monitor. | +| url | string | Yes | The URL to monitor. | +| prompt | string | Yes | What data to extract on each run. | +| cron | string | Yes | Cron expression for the schedule (e.g., `"0 9 * * *"` for daily at 9 AM). | +| output_schema | object | No | Pydantic or Zod schema for structured response format. | +| fetch_config | FetchConfig | No | Configuration for page fetching (headers, stealth, etc.). | +| llm_config | LlmConfig | No | Configuration for the AI model (temperature, max_tokens, etc.). 
| + + +Get your API key from the [dashboard](https://dashboard.scrapegraphai.com) + + +## Managing Monitors + +### List All Monitors + +```python +monitors = client.monitor.list() +print("All monitors:", monitors) +``` + +### Get a Specific Monitor + +```python +monitor = client.monitor.get(monitor_id) +print("Monitor details:", monitor) +``` + +### Pause a Monitor + +```python +client.monitor.pause(monitor_id) +``` + +### Resume a Monitor + +```python +client.monitor.resume(monitor_id) +``` + +### Delete a Monitor + +```python +client.monitor.delete(monitor_id) +``` + +## Advanced Usage + +### With Output Schema and Config + +```python +from pydantic import BaseModel, Field +from scrapegraph_py import Client, FetchConfig, LlmConfig + +class ProductPrice(BaseModel): + name: str = Field(description="Product name") + price: float = Field(description="Current price") + in_stock: bool = Field(description="Whether the product is in stock") + +client = Client(api_key="your-api-key") + +monitor = client.monitor.create( + name="Product Price Monitor", + url="https://example.com/products", + prompt="Extract product names, prices, and stock status", + cron="0 */6 * * *", # Every 6 hours + output_schema=ProductPrice, + fetch_config=FetchConfig(stealth=True), + llm_config=LlmConfig(temperature=0.1), +) +``` + +### Async Support + +```python +import asyncio +from scrapegraph_py import AsyncClient + +async def main(): + async with AsyncClient(api_key="your-api-key") as client: + monitor = await client.monitor.create( + name="Tracker", + url="https://example.com", + prompt="Extract prices", + cron="0 9 * * *", + ) + + monitors = await client.monitor.list() + print("All monitors:", monitors) + +asyncio.run(main()) +``` + +## Key Features + + + + Run extraction jobs on any cron schedule + + + Use natural language prompts to define what to extract + + + Create, pause, resume, and delete monitors easily + + + Define structured output with Pydantic or Zod schemas + + + +## Common Cron 
Expressions + +| Expression | Schedule | +|-----------|----------| +| `0 9 * * *` | Daily at 9 AM | +| `0 */6 * * *` | Every 6 hours | +| `0 9 * * 1` | Every Monday at 9 AM | +| `0 0 1 * *` | First day of every month | +| `*/30 * * * *` | Every 30 minutes | + +## Integration Options + +### Official SDKs +- [Python SDK](/sdks/python) - Perfect for data science and backend applications +- [JavaScript SDK](/sdks/javascript) - Ideal for web applications and Node.js + +### AI Framework Integrations +- [LangChain Integration](/integrations/langchain) - Use Monitor in your LLM workflows + +## Support & Resources + + + + Comprehensive guides and tutorials + + + Detailed API documentation + + + Join our Discord community + + + Check out our open-source projects + + diff --git a/services/scrape.mdx b/services/scrape.mdx index f2a2373..3f9d5a3 100644 --- a/services/scrape.mdx +++ b/services/scrape.mdx @@ -1,6 +1,6 @@ --- title: 'Scrape' -description: 'Extract raw HTML content from web pages with JavaScript rendering support' +description: 'Scrape web pages in markdown, HTML, or screenshot format' icon: 'code' --- @@ -10,7 +10,7 @@ icon: 'code' ## Overview -The Scrape service provides direct access to raw HTML content from web pages, with optional JavaScript rendering support. This service is perfect for applications that need the complete HTML structure of a webpage, including dynamically generated content. +The Scrape service fetches web page content and returns it in your preferred format: markdown, HTML, screenshot, or branding. It replaces the previous Markdownify service and supports flexible output through a simple `format` parameter. 
Try the Scrape service instantly in our [interactive playground](https://dashboard.scrapegraphai.com/) @@ -24,25 +24,17 @@ Try the Scrape service instantly in our [interactive playground](https://dashboa ```python Python from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger -sgai_logger.set_logging(level="INFO") +client = Client(api_key="your-api-key") -# Initialize the client -sgai_client = Client(api_key="your-api-key") +# Get markdown (default) +response = client.scrape("https://example.com") -# Scrape request -response = sgai_client.htmlify( - website_url="https://example.com", - branding=True # Set to True to extract brand design and metadata -) +# Get HTML +response = client.scrape("https://example.com", format="html") -print("HTML Content:", response.html) -print("Request ID:", response.scrape_request_id) -print("Status:", response.status) -# Optional branding result -if response.branding: - print("Branding extracted") +# Get screenshot +response = client.scrape("https://example.com", format="screenshot") ``` ```javascript JavaScript @@ -50,185 +42,100 @@ import { scrape } from 'scrapegraph-js'; const apiKey = 'your-api-key'; +// Get markdown (default) const response = await scrape(apiKey, { - website_url: 'https://example.com', - branding: true, + url: 'https://example.com', }); -if (response.status === 'error') { - console.error('Error:', response.error); -} else { - console.log('HTML Content:', response.data.html); - console.log('Request ID:', response.data.scrape_request_id); - console.log('Status:', response.data.status); - if (response.data.branding) { - console.log('Branding extracted'); - } -} +// Get HTML +const htmlResponse = await scrape(apiKey, { + url: 'https://example.com', + format: 'html', +}); + +console.log(response); ``` ```bash cURL -curl -X POST https://api.scrapegraphai.com/v1/scrape \ +curl -X POST https://api.scrapegraphai.com/api/v2/scrape \ -H "Content-Type: application/json" \ - -H "SGAI-APIKEY: 
your-api-key" \ + -H "Authorization: Bearer your-api-key" \ -d '{ - "website_url": "https://example.com", - "branding": true + "url": "https://example.com", + "format": "markdown" }' ``` -```bash CLI -just-scrape scrape https://example.com --branding -``` - #### Parameters | Parameter | Type | Required | Description | |-----------|------|----------|-------------| -| apiKey | string | Yes | The ScrapeGraph API Key. | -| website_url | string | Yes | The URL of the webpage to scrape. | -| branding | boolean | No | Return extracted brand design and metadata. Default: false | -| stealth | boolean | No | Enable stealth mode for anti-bot protection. Adds additional credits. Default: false | -| wait_ms | integer | No | Milliseconds to wait before capturing page content. Default: 3000 | -| country_code | string | No | Two-letter ISO country code for geo-targeted proxy routing (e.g., "us", "gb", "de"). | +| url | string | Yes | The URL of the webpage to scrape. | +| format | string | No | Output format: `"markdown"` (default), `"html"`, `"screenshot"`, or `"branding"`. | +| fetch_config | FetchConfig | No | Configuration for page fetching (headers, stealth, etc.). | Get your API key from the [dashboard](https://dashboard.scrapegraphai.com) - + ```json { - "scrape_request_id": "2f0f7a7e-7eb3-4bd2-8f8d-ae8a7f2d9c1a", - "status": "completed", - "html": "Example Page

Welcome to Example.com

This is the raw HTML content...

", - "error": "" -} -``` - -The response includes: -- `scrape_request_id`: Unique identifier for tracking your request -- `status`: Current status of the scraping operation -- `html`: Raw HTML content of the webpage -- `error`: Error message (if any occurred during scraping) -
- - -```json -{ - "scrape_request_id": "2f0f7a7e-7eb3-4bd2-8f8d-ae8a7f2d9c1a", - "status": "completed", - "html": "...", - "error": "", - "branding": { - "branding": { - "colorScheme": "light", - "colors": { - "primary": "#0B5FFF", - "accent": "#FF8A00", - "background": "#FFFFFF", - "textPrimary": "#111827", - "link": "#0B5FFF" - }, - "fonts": [ - { "family": "Inter", "role": "body" } - ], - "typography": { - "fontFamilies": { "primary": "Inter", "heading": "Inter" }, - "fontStacks": { "heading": ["Inter"], "body": ["Inter"] }, - "fontSizes": { "h1": "32px", "h2": "24px", "body": "16px" } - }, - "spacing": { "baseUnit": 4, "borderRadius": "6px" }, - "components": { - "input": { "borderColor": "#E5E7EB", "borderRadius": "6px" }, - "buttonPrimary": { - "background": "#0B5FFF", - "textColor": "#FFFFFF", - "borderRadius": "6px", - "shadow": "..." - } - }, - "images": { - "logo": "https://example.com/logo.svg", - "favicon": "https://example.com/favicon.ico", - "ogImage": "https://example.com/og.png" - }, - "designSystem": { "framework": "tailwind", "componentLibrary": null }, - "confidence": { "overall": 0.86 } - }, - "metadata": { - "title": "Example", - "language": "en", - "favicon": "https://example.com/favicon.ico" - } + "id": "0d6c4b31-931b-469b-9a7f-2f1e002e79ca", + "format": "markdown", + "content": [ + "# Example Domain\n\nThis domain is for use in documentation examples..." + ], + "metadata": { + "contentType": "text/html" } } ``` - -When `branding=true` is passed, the response includes a `branding` object with brand design data and page metadata. -## Key Features +## Output Formats - - - Get complete HTML structure including all elements - - - Optionally extract brand colors, fonts, typography, UI components, images, and metadata - - - Quick extraction for simple HTML content - - - Consistent results across different websites - - +| Format | Description | +|--------|-------------| +| `markdown` | Clean markdown conversion of the page content (default). 
| +| `html` | Raw HTML content of the page. | +| `screenshot` | Screenshot image of the rendered page. | +| `branding` | Brand design extraction: colors, fonts, typography, logos. | -## Use Cases - -### Web Development -- Extract HTML templates -- Analyze page structure -- Test website rendering -- Debug HTML issues - -### Data Analysis -- Parse HTML content -- Extract specific elements -- Monitor website changes -- Build web scrapers - -### Content Processing -- Process dynamic content -- Handle JavaScript-heavy sites -- Extract embedded data -- Analyze page performance +## Advanced Usage - -Want to learn more about our AI-powered scraping technology? Visit our [main website](https://scrapegraphai.com) to discover how we're revolutionizing web data extraction. - +### With FetchConfig -## Advanced Usage +```python +from scrapegraph_py import Client, FetchConfig + +client = Client(api_key="your-api-key") + +response = client.scrape( + "https://example.com", + format="markdown", + fetch_config=FetchConfig( + render_js=True, + stealth=True, + wait_ms=2000, + headers={"User-Agent": "MyBot"}, + ), +) +``` ### Async Support -For applications requiring asynchronous execution, the Scrape service provides async support: - ```python -from scrapegraph_py import AsyncClient import asyncio +from scrapegraph_py import AsyncClient async def main(): async with AsyncClient(api_key="your-api-key") as client: - response = await client.htmlify( - website_url="https://example.com" - ) + response = await client.scrape("https://example.com") print(response) -# Run the async function asyncio.run(main()) ``` @@ -239,41 +146,44 @@ Process multiple URLs concurrently for better performance: ```python import asyncio from scrapegraph_py import AsyncClient -from scrapegraph_py.logger import sgai_logger - -sgai_logger.set_logging(level="INFO") async def main(): - # Initialize async client - sgai_client = AsyncClient(api_key="your-api-key") - - # URLs to scrape - urls = [ - "https://example.com", 
- "https://scrapegraphai.com/", - "https://github.com/ScrapeGraphAI/Scrapegraph-ai", - ] - - tasks = [sgai_client.htmlify(website_url=url) for url in urls] - - # Execute requests concurrently - responses = await asyncio.gather(*tasks, return_exceptions=True) - - # Process results - for i, response in enumerate(responses): - if isinstance(response, Exception): - print(f"\nError for {urls[i]}: {response}") - else: - print(f"\nPage {i+1} HTML:") - print(f"URL: {urls[i]}") - print(f"HTML Length: {len(response['html'])} characters") - - await sgai_client.close() - -if __name__ == "__main__": - asyncio.run(main()) + async with AsyncClient(api_key="your-api-key") as client: + urls = [ + "https://example.com", + "https://scrapegraphai.com/", + "https://github.com/ScrapeGraphAI/Scrapegraph-ai", + ] + + tasks = [client.scrape(url) for url in urls] + responses = await asyncio.gather(*tasks, return_exceptions=True) + + for i, response in enumerate(responses): + if isinstance(response, Exception): + print(f"Error for {urls[i]}: {response}") + else: + print(f"Page {i+1}: {len(str(response))} characters") + +asyncio.run(main()) ``` +## Key Features + + + + Get content as markdown, HTML, screenshot, or branding data + + + Handle JavaScript-heavy sites with render_js support + + + Quick extraction for simple content + + + Consistent results across different websites + + + ## Integration Options ### Official SDKs @@ -284,37 +194,6 @@ if __name__ == "__main__": - [LangChain Integration](/integrations/langchain) - Use Scrape in your content pipelines - [LlamaIndex Integration](/integrations/llamaindex) - Create searchable knowledge bases -## Best Practices - -### Performance Optimization -1. Process multiple URLs concurrently -3. Cache results when possible -4. 
Monitor API usage and costs - -### Error Handling -- Always check the `status` field -- Handle network timeouts gracefully -- Implement retry logic for failed requests -- Log errors for debugging - -### Content Processing -- Validate HTML structure before parsing -- Handle different character encodings -- Extract only needed content sections -- Clean up HTML for further processing - -## Example Projects - -Check out our [cookbook](/cookbook/introduction) for real-world examples: -- Web scraping automation tools -- Content monitoring systems -- HTML analysis applications -- Dynamic content extractors - -## API Reference - -For detailed API documentation, see the [API Reference](/api-reference/introduction). - ## Support & Resources @@ -330,9 +209,6 @@ For detailed API documentation, see the [API Reference](/api-reference/introduct Check out our open-source projects - - Visit our official website - diff --git a/services/search.mdx b/services/search.mdx new file mode 100644 index 0000000..b060710 --- /dev/null +++ b/services/search.mdx @@ -0,0 +1,167 @@ +--- +title: 'Search' +description: 'AI-powered web search with structured data extraction' +icon: 'magnifying-glass' +--- + +## Overview + +Search enables you to search the web and extract structured results using AI. It combines web search capabilities with intelligent data extraction, returning clean, structured data from search results. + + +Try Search instantly in our [interactive playground](https://dashboard.scrapegraphai.com/) + + +## Getting Started + +### Quick Start + + + +```python Python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +response = client.search( + query="What are the key features of ChatGPT Plus?" 
+) +``` + +```javascript JavaScript +import { search } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; + +const response = await search(apiKey, { + query: 'What are the key features of ChatGPT Plus?', +}); + +console.log(response); +``` + +```bash cURL +curl -X POST https://api.scrapegraphai.com/api/v2/search \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-api-key" \ + -d '{ + "query": "What are the key features of ChatGPT Plus?" + }' +``` + + + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| query | string | Yes | The search query to execute. | +| num_results | int | No | Number of search results to return. | +| output_schema | object | No | Pydantic or Zod schema for structured response format. | + + +Get your API key from the [dashboard](https://dashboard.scrapegraphai.com) + + +## Custom Schema Example + +Define the structure of the output using Pydantic or Zod: + + + +```python Python +from pydantic import BaseModel, Field +from scrapegraph_py import Client + +class SearchResult(BaseModel): + title: str = Field(description="The result title") + summary: str = Field(description="Brief summary of the result") + url: str = Field(description="Source URL") + +client = Client(api_key="your-api-key") + +response = client.search( + query="Latest AI developments 2024", + num_results=10, + output_schema=SearchResult, +) +``` + +```javascript JavaScript +import { search } from 'scrapegraph-js'; +import { z } from 'zod'; + +const schema = z.object({ + title: z.string().describe('The result title'), + summary: z.string().describe('Brief summary'), + url: z.string().describe('Source URL'), +}); + +const response = await search('your-api-key', { + query: 'Latest AI developments 2024', + num_results: 10, + output_schema: schema, +}); +``` + + + +## Async Support + +```python +import asyncio +from scrapegraph_py import AsyncClient + +async def main(): + async with 
AsyncClient(api_key="your-api-key") as client: + response = await client.search( + query="Best practices for web scraping" + ) + print(response) + +asyncio.run(main()) +``` + +## Key Features + + + + Intelligent extraction from search results + + + Returns clean, structured data in your preferred format + + + Define custom output schemas using Pydantic or Zod + + + Control the number of search results returned + + + +## Integration Options + +### Official SDKs +- [Python SDK](/sdks/python) - Perfect for data science and backend applications +- [JavaScript SDK](/sdks/javascript) - Ideal for web applications and Node.js + +### AI Framework Integrations +- [LangChain Integration](/integrations/langchain) - Use Search in your LLM workflows +- [LlamaIndex Integration](/integrations/llamaindex) - Build powerful search and QA systems + +## Support & Resources + + + + Comprehensive guides and tutorials + + + Detailed API documentation + + + Join our Discord community + + + Check out our open-source projects + + diff --git a/v1/additional-parameters/headers.mdx b/v1/additional-parameters/headers.mdx new file mode 100644 index 0000000..b202338 --- /dev/null +++ b/v1/additional-parameters/headers.mdx @@ -0,0 +1,23 @@ +--- +title: 'Custom Headers' +description: 'Pass custom HTTP headers with your requests (v1)' +icon: 'heading' +--- + + +You are viewing the **v1 (legacy)** documentation. In v2, use `FetchConfig(headers={...})`. See the [v2 documentation](/services/additional-parameters/headers). + + +## Custom Headers + +Pass custom HTTP headers with your scraping requests: + +```python +response = client.smartscraper( + website_url="https://example.com", + user_prompt="Extract data", + headers={"Authorization": "Bearer token", "Accept-Language": "en-US"} +) +``` + +For v2 usage with `FetchConfig`, see the [v2 documentation](/services/additional-parameters/headers). 
diff --git a/v1/additional-parameters/pagination.mdx b/v1/additional-parameters/pagination.mdx new file mode 100644 index 0000000..a32d8dd --- /dev/null +++ b/v1/additional-parameters/pagination.mdx @@ -0,0 +1,15 @@ +--- +title: 'Pagination' +description: 'Handle paginated content (v1)' +icon: 'arrow-right' +--- + + +You are viewing the **v1 (legacy)** documentation. See the [v2 documentation](/services/additional-parameters/pagination). + + +## Pagination + +Handle paginated content using the `number_of_scrolls` parameter or by specifying pagination logic in your prompt. + +For v2 usage, see the [v2 documentation](/services/additional-parameters/pagination). diff --git a/v1/additional-parameters/proxy.mdx b/v1/additional-parameters/proxy.mdx new file mode 100644 index 0000000..57744d9 --- /dev/null +++ b/v1/additional-parameters/proxy.mdx @@ -0,0 +1,23 @@ +--- +title: 'Proxy' +description: 'Route requests through specific countries (v1)' +icon: 'shield' +--- + + +You are viewing the **v1 (legacy)** documentation. In v2, use `FetchConfig(country="us")`. See the [v2 documentation](/services/additional-parameters/proxy). + + +## Proxy Routing + +Route scraping requests through proxies in specific countries using the `country_code` parameter: + +```python +response = client.smartscraper( + website_url="https://example.com", + user_prompt="Extract data", + country_code="us" +) +``` + +For v2 usage with `FetchConfig`, see the [v2 documentation](/services/additional-parameters/proxy). diff --git a/v1/additional-parameters/wait-ms.mdx b/v1/additional-parameters/wait-ms.mdx new file mode 100644 index 0000000..93fbbe6 --- /dev/null +++ b/v1/additional-parameters/wait-ms.mdx @@ -0,0 +1,23 @@ +--- +title: 'Wait Time' +description: 'Configure page load wait time (v1)' +icon: 'clock' +--- + + +You are viewing the **v1 (legacy)** documentation. In v2, use `FetchConfig(wait_ms=3000)`. See the [v2 documentation](/services/additional-parameters/wait-ms). 
+ + +## Wait Time + +Configure how long to wait for the page to load before scraping: + +```python +response = client.smartscraper( + website_url="https://example.com", + user_prompt="Extract data", + wait_ms=5000 # Wait 5 seconds +) +``` + +For v2 usage with `FetchConfig`, see the [v2 documentation](/services/additional-parameters/wait-ms). diff --git a/v1/agenticscraper.mdx b/v1/agenticscraper.mdx new file mode 100644 index 0000000..9b1ee49 --- /dev/null +++ b/v1/agenticscraper.mdx @@ -0,0 +1,39 @@ +--- +title: 'AgenticScraper' +description: 'Agent-based multi-step scraping (v1)' +icon: 'robot' +--- + + +You are viewing the **v1 (legacy)** documentation. AgenticScraper has been removed in v2. Use `extract()` with `FetchConfig` for advanced scraping, or `crawl.start()` for multi-page extraction. See the v2 [Extract](/services/extract) and [Crawl](/services/crawl) documentation. + + +## Overview + +AgenticScraper uses an AI agent to perform multi-step scraping operations, navigating through pages and interacting with elements as needed. + +## Usage + + + +```python Python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +response = client.agenticscraper( + website_url="https://example.com", + user_prompt="Navigate to the pricing page and extract all plan details" +) +``` + +```javascript JavaScript +import { agenticScraper } from "scrapegraph-js"; + +const response = await agenticScraper(apiKey, { + website_url: "https://example.com", + user_prompt: "Navigate to the pricing page and extract all plan details", +}); +``` + + diff --git a/v1/api-reference/introduction.mdx b/v1/api-reference/introduction.mdx new file mode 100644 index 0000000..660053e --- /dev/null +++ b/v1/api-reference/introduction.mdx @@ -0,0 +1,51 @@ +--- +title: 'API Reference' +description: 'ScrapeGraphAI v1 API Reference' +icon: 'book' +--- + + +You are viewing the **v1 (legacy)** API documentation. The v1 API uses `/v1/*` endpoints.
Please migrate to the [v2 API](/api-reference/introduction) which uses `/api/v2/*` endpoints. + + +## Base URL + +``` +https://api.scrapegraphai.com/v1 +``` + +## Authentication + +All v1 API requests require the `SGAI-APIKEY` header: + +```bash +curl -X POST "https://api.scrapegraphai.com/v1/smartscraper" \ + -H "SGAI-APIKEY: your-api-key" \ + -H "Content-Type: application/json" \ + -d '{"website_url": "https://example.com", "user_prompt": "Extract data"}' +``` + + +In v2, authentication uses the `Authorization: Bearer` header instead. + + +## v1 Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/v1/smartscraper` | POST | Start a SmartScraper job | +| `/v1/smartscraper/{id}` | GET | Get SmartScraper job status | +| `/v1/searchscraper` | POST | Start a SearchScraper job | +| `/v1/searchscraper/{id}` | GET | Get SearchScraper job status | +| `/v1/markdownify` | POST | Start a Markdownify job | +| `/v1/markdownify/{id}` | GET | Get Markdownify job status | +| `/v1/smartcrawler` | POST | Start a SmartCrawler job | +| `/v1/smartcrawler/{id}` | GET | Get SmartCrawler job status | +| `/v1/sitemap` | POST | Start a Sitemap job | +| `/v1/sitemap/{id}` | GET | Get Sitemap job status | +| `/v1/credits` | GET | Get remaining credits | +| `/v1/feedback` | POST | Submit feedback | + +## Migration to v2 + +See the [v2 API Reference](/api-reference/introduction) for the latest endpoints and authentication methods. diff --git a/v1/cli/ai-agent-skill.mdx b/v1/cli/ai-agent-skill.mdx new file mode 100644 index 0000000..eea9b7e --- /dev/null +++ b/v1/cli/ai-agent-skill.mdx @@ -0,0 +1,15 @@ +--- +title: 'AI Agent Skill' +description: 'Use CLI as an AI agent skill (v1)' +icon: 'robot' +--- + + +You are viewing the **v1 (legacy)** documentation. See the [v2 AI Agent Skill documentation](/services/cli/ai-agent-skill). 
+ + +## Overview + +The ScrapeGraphAI CLI can be used as a skill within AI agent frameworks, enabling agents to scrape and extract web data. + +For detailed usage, see the [v2 documentation](/services/cli/ai-agent-skill). diff --git a/v1/cli/commands.mdx b/v1/cli/commands.mdx new file mode 100644 index 0000000..6eddc1a --- /dev/null +++ b/v1/cli/commands.mdx @@ -0,0 +1,20 @@ +--- +title: 'CLI Commands' +description: 'Available CLI commands (v1)' +icon: 'terminal' +--- + + +You are viewing the **v1 (legacy)** documentation. See the [v2 CLI commands](/services/cli/commands). + + +## Available Commands + +| Command | Description | +|---------|-------------| +| `sgai smartscraper` | Extract data from a webpage using AI | +| `sgai searchscraper` | Search and extract from multiple sources | +| `sgai markdownify` | Convert webpage to markdown | +| `sgai credits` | Check remaining API credits | + +For detailed usage, see the [v2 CLI documentation](/services/cli/commands). diff --git a/v1/cli/examples.mdx b/v1/cli/examples.mdx new file mode 100644 index 0000000..af5365f --- /dev/null +++ b/v1/cli/examples.mdx @@ -0,0 +1,31 @@ +--- +title: 'CLI Examples' +description: 'CLI usage examples (v1)' +icon: 'play' +--- + + +You are viewing the **v1 (legacy)** documentation. See the [v2 CLI examples](/services/cli/examples). + + +## Examples + +### Extract company info + +```bash +sgai smartscraper --url "https://example.com/about" --prompt "Extract the company name and description" +``` + +### Search the web + +```bash +sgai searchscraper --prompt "Latest AI news" --num-results 5 +``` + +### Convert to markdown + +```bash +sgai markdownify --url "https://example.com/article" +``` + +For more examples, see the [v2 CLI documentation](/services/cli/examples). 
diff --git a/v1/cli/introduction.mdx b/v1/cli/introduction.mdx new file mode 100644 index 0000000..161dace --- /dev/null +++ b/v1/cli/introduction.mdx @@ -0,0 +1,27 @@ +--- +title: 'CLI Introduction' +description: 'ScrapeGraphAI Command Line Interface (v1)' +icon: 'terminal' +--- + + +You are viewing the **v1 (legacy)** documentation. See the [v2 CLI documentation](/services/cli/introduction). + + +## Overview + +The ScrapeGraphAI CLI provides a command-line interface for interacting with ScrapeGraphAI services directly from your terminal. + +## Installation + +```bash +pip install scrapegraph-py +``` + +## Quick Start + +```bash +sgai smartscraper --url "https://example.com" --prompt "Extract the title" +``` + +For more details, see the [v2 CLI documentation](/services/cli/introduction). diff --git a/v1/cli/json-mode.mdx b/v1/cli/json-mode.mdx new file mode 100644 index 0000000..932ca48 --- /dev/null +++ b/v1/cli/json-mode.mdx @@ -0,0 +1,17 @@ +--- +title: 'JSON Mode' +description: 'CLI JSON output mode (v1)' +icon: 'code' +--- + + +You are viewing the **v1 (legacy)** documentation. See the [v2 JSON mode documentation](/services/cli/json-mode). + + +## JSON Output + +Use the `--json` flag to get structured JSON output from CLI commands: + +```bash +sgai smartscraper --url "https://example.com" --prompt "Extract data" --json +``` diff --git a/v1/introduction.mdx b/v1/introduction.mdx new file mode 100644 index 0000000..23eaab7 --- /dev/null +++ b/v1/introduction.mdx @@ -0,0 +1,88 @@ +--- +title: Introduction +description: 'Welcome to ScrapeGraphAI v1 - AI-Powered Web Data Extraction' +--- + + +You are viewing the **v1 (legacy)** documentation. v1 is deprecated and will be removed in a future release. Please migrate to [v2](/introduction) for the latest features and improvements. 
+ + + + +## Overview + +[ScrapeGraphAI](https://scrapegraphai.com) is a powerful suite of LLM-driven web scraping tools designed to extract structured data from any website and HTML content. Our API is designed to be easy to use and integrate with your existing workflows. + +### Perfect For + + + + Feed your AI agents with structured web data for enhanced decision-making + + + Extract and structure web data for research and analysis + + + Build comprehensive datasets from web sources + + + Create scraping-powered platforms and applications + + + +## Getting Started + + + + Sign up and access your API key from the [dashboard](https://dashboard.scrapegraphai.com) + + + Select from our specialized extraction services based on your needs + + + Begin extracting data using our SDKs or direct API calls + + + +## Core Services + +- **SmartScraper**: AI-powered extraction for any webpage +- **SearchScraper**: Find and extract any data using AI starting from a prompt +- **SmartCrawler**: AI-powered extraction across multiple pages of a website +- **Markdownify**: Convert web content to clean Markdown format +- **Sitemap**: Extract sitemaps from websites +- **AgenticScraper**: Agent-based multi-step scraping +- **Toonify**: Convert images to cartoon style + +## v1 SDKs + +### Python +```python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +response = client.smartscraper( + website_url="https://example.com", + user_prompt="Extract the main content" +) +``` + +### JavaScript +```javascript +import { smartScraper } from "scrapegraph-js"; + +const response = await smartScraper(apiKey, { + website_url: "https://example.com", + user_prompt: "What does the company do?", +}); +``` + +## Migrate to v2 + +v2 brings significant improvements including renamed methods, unified configuration objects, and new endpoints.
See the migration guides: +- [Python SDK Migration Guide](https://github.com/ScrapeGraphAI/scrapegraph-py/blob/main/MIGRATION_V2.md) +- [JavaScript SDK Migration Guide](https://github.com/ScrapeGraphAI/scrapegraph-js/blob/main/MIGRATION.md) diff --git a/v1/markdownify.mdx b/v1/markdownify.mdx new file mode 100644 index 0000000..2a4028e --- /dev/null +++ b/v1/markdownify.mdx @@ -0,0 +1,46 @@ +--- +title: 'Markdownify' +description: 'Convert web content to clean markdown (v1)' +icon: 'markdown' +--- + + +You are viewing the **v1 (legacy)** documentation. In v2, Markdownify has been replaced by `scrape()` with `output_format="markdown"`. See the [v2 Scrape documentation](/services/scrape). + + +## Overview + +Markdownify converts any webpage into clean, formatted markdown. + +## Usage + + + +```python Python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +response = client.markdownify( + website_url="https://example.com" +) +``` + +```javascript JavaScript +import { markdownify } from "scrapegraph-js"; + +const response = await markdownify(apiKey, { + website_url: "https://example.com", +}); +``` + + + +### Parameters + +| Parameter | Type | Required | Description | +| ------------ | ------- | -------- | ---------------------------------------- | +| website_url | string | Yes | The URL of the webpage to convert | +| wait_ms | number | No | Page load wait time in ms | +| stealth | boolean | No | Enable anti-detection mode | +| country_code | string | No | Proxy routing country code | diff --git a/v1/mcp-server/claude.mdx b/v1/mcp-server/claude.mdx new file mode 100644 index 0000000..d03af08 --- /dev/null +++ b/v1/mcp-server/claude.mdx @@ -0,0 +1,11 @@ +--- +title: 'Claude Integration' +description: 'Use ScrapeGraphAI MCP with Claude (v1)' +icon: 'message-bot' +--- + + +You are viewing the **v1 (legacy)** documentation. See the [v2 Claude integration](/services/mcp-server/claude).
+ + +For Claude MCP setup, see the [v2 documentation](/services/mcp-server/claude). diff --git a/v1/mcp-server/cursor.mdx b/v1/mcp-server/cursor.mdx new file mode 100644 index 0000000..dbcdfa4 --- /dev/null +++ b/v1/mcp-server/cursor.mdx @@ -0,0 +1,11 @@ +--- +title: 'Cursor Integration' +description: 'Use ScrapeGraphAI MCP with Cursor (v1)' +icon: 'code' +--- + + +You are viewing the **v1 (legacy)** documentation. See the [v2 Cursor integration](/services/mcp-server/cursor). + + +For Cursor MCP setup, see the [v2 documentation](/services/mcp-server/cursor). diff --git a/v1/mcp-server/introduction.mdx b/v1/mcp-server/introduction.mdx new file mode 100644 index 0000000..3162ccf --- /dev/null +++ b/v1/mcp-server/introduction.mdx @@ -0,0 +1,15 @@ +--- +title: 'MCP Server Introduction' +description: 'ScrapeGraphAI MCP Server (v1)' +icon: 'server' +--- + + +You are viewing the **v1 (legacy)** documentation. See the [v2 MCP Server documentation](/services/mcp-server/introduction). + + +## Overview + +The ScrapeGraphAI MCP (Model Context Protocol) Server enables AI assistants and tools to use ScrapeGraphAI as a data source. + +For setup and usage, see the [v2 MCP Server documentation](/services/mcp-server/introduction). diff --git a/v1/mcp-server/smithery.mdx b/v1/mcp-server/smithery.mdx new file mode 100644 index 0000000..edee161 --- /dev/null +++ b/v1/mcp-server/smithery.mdx @@ -0,0 +1,11 @@ +--- +title: 'Smithery Integration' +description: 'Use ScrapeGraphAI MCP with Smithery (v1)' +icon: 'hammer' +--- + + +You are viewing the **v1 (legacy)** documentation. See the [v2 Smithery integration](/services/mcp-server/smithery). + + +For Smithery MCP setup, see the [v2 documentation](/services/mcp-server/smithery). 
diff --git a/v1/quickstart.mdx b/v1/quickstart.mdx new file mode 100644 index 0000000..34fe5d8 --- /dev/null +++ b/v1/quickstart.mdx @@ -0,0 +1,69 @@ +--- +title: Quickstart +description: 'Get started with ScrapeGraphAI v1 SDKs' +--- + + +You are viewing the **v1 (legacy)** documentation. Please migrate to [v2](/install) for the latest features. + + +## Prerequisites + +- Obtain your **API key** by signing up on the [ScrapeGraphAI Dashboard](https://dashboard.scrapegraphai.com) + +--- + +## Python SDK + +```bash +pip install scrapegraph-py +``` + +```python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key-here") + +response = client.smartscraper( + website_url="https://scrapegraphai.com", + user_prompt="Extract information about the company" +) +print(response) +``` + + +You can also set the `SGAI_API_KEY` environment variable and initialize the client without parameters: `client = Client()` + + +--- + +## JavaScript SDK + +```bash +npm i scrapegraph-js +``` + +```javascript +import { smartScraper } from "scrapegraph-js"; + +const apiKey = "your-api-key-here"; + +const response = await smartScraper(apiKey, { + website_url: "https://scrapegraphai.com", + user_prompt: "What does the company do?", +}); + +if (response.status === "error") { + console.error("Error:", response.error); +} else { + console.log(response.data.result); +} +``` + +--- + +## Next Steps + +- Explore the [SmartScraper](/v1/smartscraper) service +- Check out [SearchScraper](/v1/searchscraper) for search-based extraction +- Use [Markdownify](/v1/markdownify) for HTML-to-markdown conversion diff --git a/v1/scrape.mdx b/v1/scrape.mdx new file mode 100644 index 0000000..5d6ba07 --- /dev/null +++ b/v1/scrape.mdx @@ -0,0 +1,37 @@ +--- +title: 'Scrape' +description: 'Basic webpage scraping service (v1)' +icon: 'spider-web' +--- + + +You are viewing the **v1 (legacy)** documentation. See the [v2 documentation](/services/scrape). 
+ + +## Overview + +The Scrape service provides basic webpage scraping capabilities, returning the raw content of a webpage. + +## Usage + + + +```python Python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +response = client.scrape( + website_url="https://example.com" +) +``` + +```javascript JavaScript +import { scrape } from "scrapegraph-js"; + +const response = await scrape(apiKey, { + website_url: "https://example.com", +}); +``` + + diff --git a/v1/searchscraper.mdx b/v1/searchscraper.mdx new file mode 100644 index 0000000..eba7042 --- /dev/null +++ b/v1/searchscraper.mdx @@ -0,0 +1,52 @@ +--- +title: 'SearchScraper' +description: 'Search and extract information from multiple web sources (v1)' +icon: 'magnifying-glass' +--- + + +You are viewing the **v1 (legacy)** documentation. In v2, SearchScraper has been renamed to `search()`. See the [v2 Search documentation](/services/search). + + +## Overview + +SearchScraper enables you to search the web and extract structured information from multiple sources using AI.
+ +## Usage + + + +```python Python +from scrapegraph_py import Client +from scrapegraph_py.models import TimeRange + +client = Client(api_key="your-api-key") + +response = client.searchscraper( + user_prompt="What are the key features of ChatGPT Plus?", + time_range=TimeRange.PAST_WEEK +) +``` + +```javascript JavaScript +import { searchScraper } from "scrapegraph-js"; + +const response = await searchScraper(apiKey, { + user_prompt: "Find the best restaurants in San Francisco", + location_geo_code: "us", + time_range: "past_week", +}); +``` + + + +### Parameters + +| Parameter | Type | Required | Description | +| ----------------- | --------- | -------- | -------------------------------------------------------- | +| user_prompt | string | Yes | Search query description | +| num_results | number | No | Number of websites to search (3-20) | +| extraction_mode | boolean | No | AI extraction (true) or markdown mode (false) | +| output_schema | object | No | Schema for structured response | +| location_geo_code | string | No | Geo code for location-based search | +| time_range | TimeRange | No | Time range filter for results | diff --git a/v1/sitemap.mdx b/v1/sitemap.mdx new file mode 100644 index 0000000..284f64f --- /dev/null +++ b/v1/sitemap.mdx @@ -0,0 +1,37 @@ +--- +title: 'Sitemap' +description: 'Extract sitemaps from websites (v1)' +icon: 'sitemap' +--- + + +You are viewing the **v1 (legacy)** documentation. The Sitemap endpoint has been removed in v2. Use `crawl.start()` with URL patterns instead. See the [v2 Crawl documentation](/services/crawl). + + +## Overview + +The Sitemap service extracts and parses sitemap data from any website.
+ +## Usage + + + +```python Python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +response = client.sitemap( + website_url="https://example.com" +) +``` + +```javascript JavaScript +import { sitemap } from "scrapegraph-js"; + +const response = await sitemap(apiKey, { + website_url: "https://example.com", +}); +``` + + diff --git a/v1/smartcrawler.mdx b/v1/smartcrawler.mdx new file mode 100644 index 0000000..6ceb27f --- /dev/null +++ b/v1/smartcrawler.mdx @@ -0,0 +1,41 @@ +--- +title: 'SmartCrawler' +description: 'AI-powered multi-page crawling service (v1)' +icon: 'spider' +--- + + +You are viewing the **v1 (legacy)** documentation. In v2, crawling uses `crawl.start()`, `crawl.status()`, `crawl.stop()`, and `crawl.resume()`. See the [v2 Crawl documentation](/services/crawl). + + +## Overview + +SmartCrawler enables AI-powered extraction across multiple pages of a website, automatically navigating and collecting structured data. + +## Usage + + + +```python Python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +response = client.crawl( + website_url="https://example.com", + user_prompt="Extract all blog post titles", + depth=2 +) +``` + +```javascript JavaScript +import { smartCrawler } from "scrapegraph-js"; + +const response = await smartCrawler(apiKey, { + website_url: "https://example.com", + user_prompt: "Extract all blog post titles", + depth: 2, +}); +``` + + diff --git a/v1/smartscraper.mdx b/v1/smartscraper.mdx new file mode 100644 index 0000000..cc082cc --- /dev/null +++ b/v1/smartscraper.mdx @@ -0,0 +1,52 @@ +--- +title: 'SmartScraper' +description: 'AI-powered web scraping for any website (v1)' +icon: 'robot' +--- + + +You are viewing the **v1 (legacy)** documentation. In v2, SmartScraper has been renamed to `extract()`. See the [v2 Extract documentation](/services/extract).
+ + +## Overview + +SmartScraper is our flagship LLM-powered web scraping service that intelligently extracts structured data from any website. + +## Usage + + + +```python Python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +response = client.smartscraper( + website_url="https://example.com", + user_prompt="Extract the main heading and description" +) +``` + +```javascript JavaScript +import { smartScraper } from "scrapegraph-js"; + +const response = await smartScraper(apiKey, { + website_url: "https://example.com", + user_prompt: "Extract the main content", +}); +``` + + + +### Parameters + +| Parameter | Type | Required | Description | +| ------------- | ------- | -------- | --------------------------------------------------------------------------- | +| website_url | string | Yes | The URL of the webpage to scrape | +| user_prompt | string | Yes | A textual description of what you want to extract | +| output_schema | object | No | Pydantic/Zod schema for structured response | +| stealth | boolean | No | Enable anti-detection mode | +| headers | object | No | Custom HTTP headers | +| mock | boolean | No | Enable mock mode for testing | +| wait_ms | number | No | Page load wait time in ms | +| country_code | string | No | Proxy routing country code | diff --git a/v1/toonify.mdx b/v1/toonify.mdx new file mode 100644 index 0000000..ab5293e --- /dev/null +++ b/v1/toonify.mdx @@ -0,0 +1,25 @@ +--- +title: 'Toonify' +description: 'Convert images to cartoon style (v1)' +icon: 'palette' +--- + + +You are viewing the **v1 (legacy)** documentation. Toonify has been removed in v2. See the [v2 documentation](/services/toonify). + + +## Overview + +Toonify converts images into cartoon-style illustrations using AI. + +## Usage + +```python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key") + +response = client.toonify( + website_url="https://example.com/image.jpg" +) +```