This document provides comprehensive documentation for using APIAS as a Python module in your own applications.
Install with pip or uv:

pip install apias
# or
uv add apias

Quick start: scrape a page and clean the HTML.

from apias.config import APIASConfig
from apias.apias import Scraper, clean_html, validate_xml
# Create a scraper and fetch a page
scraper = Scraper(quiet=True)
content, mime_type = scraper.scrape("https://api.example.com/docs")
if content:
    # Clean the HTML
    cleaned = clean_html(content)
    print(f"Cleaned HTML: {len(cleaned)} characters")

APIAS exports key functions at the package level for convenience:
import apias
# Access version
print(apias.__version__) # e.g., "0.1.26"
# Parse documentation content
doc = apias.parse_documentation("GET /api/users - List users")
# Validate configuration dictionary
is_valid = apias.validate_config({
"base_url": "https://example.com",
"output_format": "xml"
})| Name | Type | Description |
|---|---|---|
__version__ |
str |
Package version string |
parse_documentation |
function |
Parse raw documentation into APIDocument |
validate_config |
function |
Validate configuration dictionary |
from apias.config import (
    DEFAULT_MODEL,            # Default OpenAI model: "gpt-5-nano"
    SUPPORTED_MODELS,         # Dict of supported models with context windows
    MODEL_PRICING,            # Dict of model pricing per 1M tokens
    COST_RATIO_CONSERVATIVE,  # P50 output/input ratio: 0.58
    COST_RATIO_AVERAGE,       # Mean output/input ratio: 1.84
    COST_RATIO_WORST_CASE,    # P95 output/input ratio: 11.82
    CHARS_PER_TOKEN,          # Characters per token estimate: 4
)

Example: List supported models
from apias.config import SUPPORTED_MODELS, DEFAULT_MODEL
print(f"Default model: {DEFAULT_MODEL}")
print("\nSupported models:")
for model, info in SUPPORTED_MODELS.items():
print(f" {model}: {info['context_window']:,} tokens - {info['description']}")Main configuration class for APIAS. Supports loading from YAML/JSON files or direct instantiation.
APIASConfig is the main configuration class for APIAS. It supports loading from YAML/JSON files or direct instantiation.

| Attribute | Type | Default | Description |
|---|---|---|---|
| `model` | `str` | `"gpt-5-nano"` | OpenAI model to use |
| `api_key` | `str \| None` | `None` | API key (uses `OPENAI_API_KEY` env var if `None`) |
| `max_tokens` | `int` | `4096` | Maximum tokens in response |
| `temperature` | `float` | `0.1` | Model temperature (0-2) |
| `num_threads` | `int` | `5` | Number of concurrent workers |
| `chunk_size` | `int` | `50000` | Max characters per HTML chunk |
| `max_retries` | `int` | `3` | Retry attempts on failure |
| `retry_delay` | `float` | `1.0` | Base delay between retries (seconds) |
| `output_folder` | `str \| None` | `None` | Output directory (auto-generated if `None`) |
| `output_format` | `str` | `"xml"` | Output format |
| `whitelist_patterns` | `List[str]` | `[]` | URL patterns to include |
| `blacklist_patterns` | `List[str]` | `[]` | URL patterns to exclude |
| `limit` | `int \| None` | `None` | Max URLs to process |
| `no_tui` | `bool` | `False` | Disable Rich TUI |
| `quiet` | `bool` | `False` | Minimal output (implies `no_tui`) |
| `auto_resume` | `bool` | `False` | Auto-resume last session |
| `progress_file` | `str` | `"progress.json"` | Progress tracking file name |
| `atomic_saves` | `bool` | `True` | Use atomic file writes for progress |
Create a new configuration with custom values.
from apias.config import APIASConfig
# Basic configuration
config = APIASConfig()
# Custom configuration
config = APIASConfig(
model="gpt-5-mini",
num_threads=10,
max_retries=5,
quiet=True
)Get the OpenAI API key from config or environment variable.
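The attribute table above also covers URL filtering and session options that the examples in this document don't otherwise touch; a sketch with illustrative values:

from apias.config import APIASConfig
# Illustrative values: crawl only API reference pages, skip changelogs,
# stop after 100 URLs, and resume the previous session if one exists
config = APIASConfig(
    output_folder="./docs_out",
    whitelist_patterns=["*api.example.com/reference*"],
    blacklist_patterns=["*changelog*"],
    limit=100,
    auto_resume=True,
)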
Get the OpenAI API key from config or environment variable.

from apias.config import APIASConfig
config = APIASConfig()
try:
    api_key = config.get_api_key()
    print(f"API key found: {api_key[:8]}...")
except ValueError as e:
    print(f"Error: {e}")

Convert configuration to a dictionary.
from apias.config import APIASConfig
config = APIASConfig(model="gpt-5-mini", num_threads=8)
config_dict = config.to_dict()
print(config_dict)
# {'model': 'gpt-5-mini', 'num_threads': 8, ...}

Create configuration from a dictionary.
from apias.config import APIASConfig
data = {
    "model": "gpt-5-nano",
    "num_threads": 5,
    "quiet": True
}
config = APIASConfig.from_dict(data)
print(config.model) # gpt-5-nano

Load configuration from a YAML file.
from apias.config import APIASConfig
# config.yaml:
# model: gpt-5-mini
# num_threads: 8
# quiet: true
config = APIASConfig.from_yaml("config.yaml")
print(config.model) # gpt-5-mini

Load configuration from a JSON file.
from apias.config import APIASConfig
# config.json:
# {"model": "gpt-5-mini", "num_threads": 8}
config = APIASConfig.from_json("config.json")
print(config.num_threads) # 8

Save configuration to a YAML file.
from apias.config import APIASConfig
config = APIASConfig(model="gpt-5-nano", num_threads=5)
config.save_yaml("my_config.yaml")Save configuration to a JSON file.
from apias.config import APIASConfig
config = APIASConfig(model="gpt-5-nano", num_threads=5)
config.save_json("my_config.json")Load configuration with precedence: CLI args > config file > defaults.
from apias.config import load_config
# Load defaults only
config = load_config()
# Load from file
config = load_config(config_path="apias_config.yaml")
# Load from file with CLI overrides
config = load_config(
config_path="apias_config.yaml",
cli_overrides={"model": "gpt-5-mini", "quiet": True}
)Generate an example YAML configuration file with explanatory comments.
from apias.config import generate_example_config
# Generate with default name
generate_example_config() # Creates "apias_config.yaml"
# Generate with custom path
generate_example_config("my_config.yaml")The generated file includes all available options with documentation:
# APIAS Configuration File
# ========================
# model: The OpenAI model to use
model: gpt-5-nano
# num_threads: Number of parallel threads
num_threads: 5
# ... (all other options documented)

Validate that a URL is well-formed with HTTP/HTTPS scheme.
from apias.config import validate_url
print(validate_url("https://example.com")) # True
print(validate_url("http://localhost:3000")) # True
print(validate_url("example.com")) # False (no scheme)
print(validate_url("ftp://example.com")) # False (wrong scheme)Filter a list of URLs, returning only valid ones.
from apias.config import validate_urls
urls = [
    "https://example.com",
    "not-a-url",
    "http://api.example.com/docs",
    "ftp://invalid.com"
]
valid = validate_urls(urls)
print(valid) # ['https://example.com', 'http://api.example.com/docs']

Estimate token count from text (approximately 4 characters per token).
from apias.config import estimate_tokens
text = "Hello, this is a sample text for token estimation."
tokens = estimate_tokens(text)
print(f"Estimated tokens: {tokens}") # ~12-13 tokensCalculate estimated API costs for given input tokens and output ratio.
Parameters:
- `input_tokens` (int): Number of input tokens
- `model` (str): Model name (default: `DEFAULT_MODEL`)
- `ratio` (float): Output/input ratio (default: `COST_RATIO_CONSERVATIVE`)
Returns: Tuple of (input_cost, output_cost, total_cost) in USD
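As a sketch of the arithmetic presumably behind the estimate (the per-million prices below are placeholders, not actual MODEL_PRICING values):

input_tokens = 100_000
ratio = 1.84 # COST_RATIO_AVERAGE (output/input)
output_tokens = input_tokens * ratio # assumed: output scales with the ratio
input_price, output_price = 0.05, 0.40 # hypothetical $ per 1M tokens
input_cost = input_tokens / 1_000_000 * input_price # $0.0050
output_cost = output_tokens / 1_000_000 * output_price # $0.0736
total_cost = input_cost + output_cost # $0.0786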
from apias.config import estimate_cost, COST_RATIO_AVERAGE
input_tokens = 100_000
input_cost, output_cost, total_cost = estimate_cost(
    input_tokens,
    model="gpt-5-nano",
    ratio=COST_RATIO_AVERAGE # 1.84x output ratio
)
print(f"Input: ${input_cost:.4f}, Output: ${output_cost:.4f}, Total: ${total_cost:.4f}")

Get cost estimates for all three scenarios (conservative, average, worst case).
from apias.config import get_cost_estimates, estimate_tokens
# Estimate costs for 1MB of text
text = "a" * 1_000_000
input_tokens = estimate_tokens(text)
estimates = get_cost_estimates(input_tokens, model="gpt-5-nano")
for scenario, data in estimates.items():
    print(f"{scenario.upper()}:")
    print(f"  Output tokens: {data['output_tokens']:,}")
    print(f"  Input cost: ${data['input_cost']:.4f}")
    print(f"  Output cost: ${data['output_cost']:.4f}")
    print(f"  Total cost: ${data['total_cost']:.4f}")
    print()

Web scraper using Playwright for JavaScript-rendered pages.
Scraper(
    playwright_available: bool | None = None,
    verify_ssl: bool = True,
    timeout: int = 30,
    quiet: bool = False
)

| Parameter | Type | Default | Description |
|---|---|---|---|
| `playwright_available` | `bool \| None` | `None` | Override Playwright availability check |
| `verify_ssl` | `bool` | `True` | Verify SSL certificates |
| `timeout` | `int` | `30` | Scraping timeout in seconds |
| `quiet` | `bool` | `False` | Suppress spinner output |
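A sketch of a scraper tuned for a slow host with a self-signed certificate (host and values illustrative; disable SSL verification only for hosts you trust):

from apias.apias import Scraper
# Wait up to 60 seconds per page and skip certificate checks,
# e.g. for an internal docs server with a self-signed cert
scraper = Scraper(verify_ssl=False, timeout=60, quiet=True)
content, mime_type = scraper.scrape("https://internal.example.com/docs")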
Scrape a URL and return the content and MIME type.
from apias.apias import Scraper
scraper = Scraper(quiet=True)
content, mime_type = scraper.scrape("https://docs.python.org/3/")
if content:
    print(f"Content length: {len(content)} characters")
    print(f"MIME type: {mime_type}")
else:
    print("Scraping failed")

Check if content appears to be HTML.
from apias.apias import Scraper
scraper = Scraper()
print(scraper.looks_like_html("<html><body>Hello</body></html>")) # True
print(scraper.looks_like_html("Just plain text")) # FalseHigh-level function to scrape a URL with automatic Playwright setup.
Parameters:
- `url` (str): The URL to scrape
- `no_tui` (bool): If True, disable TUI output (default: False)
- `quiet` (bool): If True, suppress all output (default: False)
Returns: Scraped content prefixed with URL, or None on failure
from apias.apias import start_scraping
# Basic usage
content = start_scraping("https://docs.python.org/3/", quiet=True)
if content:
print(f"Scraped {len(content)} characters")
# For batch processing (suppress output)
content = start_scraping("https://example.com/api", no_tui=True, quiet=True)Represents a parsed API document with extracted endpoints, methods, and descriptions.
APIDocument(content: str)

| Attribute | Type | Description |
|---|---|---|
| `content` | `str` | Raw document content |
| `endpoints` | `List[str]` | Extracted API endpoints |
| `methods` | `List[str]` | Extracted HTTP methods |
| `descriptions` | `List[str]` | Extracted descriptions |
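The extracted attributes can also be read directly; a minimal sketch (exact output depends on how the parser splits the content):

from apias.apias import APIDocument
doc = APIDocument("GET /api/users - List users")
print(doc.methods) # e.g. ['GET']
print(doc.endpoints) # e.g. ['/api/users']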
Convert document to Markdown format.
from apias.apias import APIDocument
content = """
GET /api/users - Get all users
POST /api/users - Create a new user
DELETE /api/users/{id} - Delete a user
"""
doc = APIDocument(content)
print(doc.to_markdown())

Convert document to a dictionary.
from apias.apias import APIDocument
doc = APIDocument("GET /api/users - List users")
data = doc.to_json()
print(data)
# {'endpoints': [...], 'methods': [...], 'descriptions': [...]}

Save document to a file in Markdown format.
from apias.apias import APIDocument
doc = APIDocument("GET /api/users - List users")
doc.save("api_docs.md")Parse raw API documentation content into a structured APIDocument.
from apias.apias import parse_documentation
content = """
# User API
GET /api/users
Returns a list of all users.
POST /api/users
Creates a new user.
"""
doc = parse_documentation(content)
print(f"Found {len(doc.endpoints)} endpoints")
print(f"Found {len(doc.methods)} methods")Validate a configuration dictionary for required fields and correct formats.
Required fields:
- `base_url`: Must be a string starting with "http"
- `output_format`: Must be one of "markdown", "html", or "xml"
from apias.apias import validate_config
# Valid configuration
config = {
"base_url": "https://api.example.com",
"output_format": "xml"
}
print(validate_config(config)) # True
# Invalid - missing required field
config = {"output_format": "xml"}
print(validate_config(config)) # False
# Invalid - wrong output format
config = {
"base_url": "https://api.example.com",
"output_format": "pdf" # Not supported
}
print(validate_config(config)) # FalseClean HTML by removing navigation, scripts, styles, and comments.
from apias.apias import clean_html
raw_html = """
<html>
<head><style>.nav { color: red; }</style></head>
<body>
<nav>Navigation here</nav>
<main>
<h1>API Documentation</h1>
<p>This is the main content.</p>
</main>
<script>alert('hello');</script>
</body>
</html>
"""
cleaned = clean_html(raw_html)
print(cleaned)
# Returns HTML with nav, script, style removed

Process HTML and extract structured data (code examples, methods, classes, images, links).
Returns a tuple of:
- Cleaned HTML string
- Page title (or "scraped_page" if none)
- List of code examples (from `<pre>` tags)
- List of class definitions
- List of image URLs
- List of (href, text) link tuples
from apias.apias import slimdown_html
html = """
<html>
<head><title>My API</title></head>
<body>
<pre>def hello(): pass</pre>
<a href="/docs">Documentation</a>
<img src="logo.png">
</body>
</html>
"""
result = slimdown_html(html)
cleaned_html, title, code_examples, methods, classes, images, links = result
print(f"Title: {title}") # "My API"
print(f"Code examples: {len(code_examples)}") # 1
print(f"Links: {len(links)}") # 1
print(f"Images: {len(images)}") # 1Split large HTML content into smaller chunks for processing within token limits.
Parameters:
- `html_content` (str): The HTML content to split
- `max_chars` (int): Maximum characters per chunk (default: ~200K)
Returns: List of HTML chunk strings
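Since token estimates assume CHARS_PER_TOKEN (4) characters per token, a per-chunk token budget converts directly to max_chars; a sketch (the 12,500-token budget is illustrative):

from apias.config import CHARS_PER_TOKEN
from apias.apias import chunk_html_by_size
token_budget = 12_500 # illustrative per-chunk token budget
max_chars = token_budget * CHARS_PER_TOKEN # 50,000 characters
chunks = chunk_html_by_size("<p>content</p>" * 10_000, max_chars=max_chars)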
from apias.apias import chunk_html_by_size
# Large HTML content
large_html = "<html><body>" + "<p>content</p>" * 10000 + "</body></html>"
# Split into manageable chunks
chunks = chunk_html_by_size(large_html, max_chars=50000)
print(f"Split into {len(chunks)} chunks")
for i, chunk in enumerate(chunks):
print(f" Chunk {i+1}: {len(chunk)} characters")Escape XML special characters in a string.
Escapes: `"`, `'`, `<`, `>`, `&`
from apias.apias import escape_xml
text = 'Hello <world> & "friends"'
escaped = escape_xml(text)
print(escaped) # Hello &lt;world&gt; &amp; &quot;friends&quot;

Validate that a string is well-formed XML.
from apias.apias import validate_xml
# Valid XML
print(validate_xml("<root><item>test</item></root>")) # True
# Invalid XML
print(validate_xml("<root><item>test</root>")) # False (unclosed tag)
print(validate_xml("not xml at all")) # FalseMerge multiple XML files from a folder into a single XML document.
Parameters:
- `temp_folder` (Path): Path to folder containing XML files
- `progress_callback` (Callable[[int, int, str], None] | None): Optional callback `(current, total, message)`
Returns: Merged XML string
from pathlib import Path
from apias.apias import merge_xmls
def on_progress(current, total, message):
print(f"[{current}/{total}] {message}")
temp_folder = Path("/tmp/apias_output")
merged_xml = merge_xmls(temp_folder, progress_callback=on_progress)
print(f"Merged XML: {len(merged_xml)} characters")Extract and filter URLs from a sitemap XML file or content.
Parameters:
- `sitemap_file` (str | None): Path to sitemap file
- `sitemap_content` (str | None): Sitemap XML content string
- `whitelist_str` (str | None): Comma-separated patterns to include
- `blacklist_str` (str | None): Comma-separated patterns to exclude
from apias.apias import extract_urls_from_sitemap
# From file
urls = extract_urls_from_sitemap(sitemap_file="sitemap.xml")
# From string content
sitemap_content = """
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page1</loc></url>
<url><loc>https://example.com/page2</loc></url>
<url><loc>https://example.com/admin/secret</loc></url>
</urlset>
"""
# With filtering
urls = extract_urls_from_sitemap(
    sitemap_content=sitemap_content,
    whitelist_str="*example.com*",
    blacklist_str="*admin*,*secret*"
)
print(urls) # ['https://example.com/page1', 'https://example.com/page2']

A complete end-to-end example: scrape a page, clean and analyze the HTML, estimate processing costs, and save the result.

from pathlib import Path
from apias.config import APIASConfig, estimate_tokens, get_cost_estimates
from apias.apias import Scraper, clean_html, slimdown_html, validate_xml

def extract_api_docs(url: str, output_path: str) -> str | None:
    """Extract API documentation from a URL.

    Args:
        url: The URL to scrape
        output_path: Where to save the processed HTML

    Returns:
        Processed HTML content, or None on failure
    """
    # 1. Configure
    config = APIASConfig(
        model="gpt-5-nano",
        num_threads=5,
        quiet=True
    )

    # 2. Scrape the page
    scraper = Scraper(quiet=True)
    content, mime_type = scraper.scrape(url)
    if not content:
        print(f"Failed to scrape {url}")
        return None
    print(f"Scraped {len(content)} characters ({mime_type})")

    # 3. Clean and process HTML
    cleaned = clean_html(content)
    result = slimdown_html(cleaned)
    final_html, title, code_examples, methods, classes, images, links = result
    print(f"Title: {title}")
    print(f"Found {len(code_examples)} code examples")
    print(f"Found {len(links)} links")

    # 4. Estimate costs before processing
    tokens = estimate_tokens(final_html)
    estimates = get_cost_estimates(tokens, config.model)
    print(f"\nCost estimates for {tokens:,} input tokens:")
    print(f"  Conservative: ${estimates['conservative']['total_cost']:.4f}")
    print(f"  Average: ${estimates['average']['total_cost']:.4f}")
    print(f"  Worst case: ${estimates['worst_case']['total_cost']:.4f}")

    # 5. Validate any XML output
    test_xml = f"<doc><title>{title}</title></doc>"
    if validate_xml(test_xml):
        print("\nXML validation: passed")

    # 6. Save processed HTML
    output = Path(output_path)
    output.write_text(final_html)
    print(f"\nSaved to {output_path}")
    return final_html

# Run the extraction
if __name__ == "__main__":
    extract_api_docs(
        "https://docs.python.org/3/library/asyncio.html",
        "asyncio_docs.html"
    )

A batch-processing example: validate a URL list, scrape each page, then clean, chunk, and save the results.

from pathlib import Path
from apias.config import APIASConfig, validate_urls
from apias.apias import start_scraping, clean_html, chunk_html_by_size

def batch_scrape(urls: list[str], output_dir: str) -> dict[str, str]:
    """Scrape multiple URLs and save results.

    Args:
        urls: List of URLs to scrape
        output_dir: Directory to save results

    Returns:
        Dict mapping URLs to output file paths
    """
    # Validate URLs first
    valid_urls = validate_urls(urls)
    print(f"Valid URLs: {len(valid_urls)}/{len(urls)}")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    results = {}
    for i, url in enumerate(valid_urls, 1):
        print(f"\n[{i}/{len(valid_urls)}] Scraping: {url}")

        # Use start_scraping for automatic setup
        content = start_scraping(url, quiet=True)
        if not content:
            print("  Failed to scrape")
            continue

        # Clean and chunk if necessary
        cleaned = clean_html(content)
        chunks = chunk_html_by_size(cleaned, max_chars=100000)

        # Save each chunk
        for j, chunk in enumerate(chunks):
            filename = f"page_{i}_chunk_{j}.html"
            filepath = output_path / filename
            filepath.write_text(chunk)
            print(f"  Saved: {filename} ({len(chunk)} chars)")

        results[url] = str(output_path / f"page_{i}_chunk_0.html")

    return results

if __name__ == "__main__":
    urls = [
        "https://docs.python.org/3/library/asyncio.html",
        "https://docs.python.org/3/library/typing.html",
        "https://docs.python.org/3/library/pathlib.html",
    ]
    batch_scrape(urls, "/tmp/batch_output")

All functions raise appropriate exceptions:
from apias.config import APIASConfig, validate_url
# Configuration validation
try:
    config = APIASConfig(num_threads=-1) # Invalid
except ValueError as e:
    print(f"Config error: {e}") # "num_threads must be at least 1"

try:
    config = APIASConfig(temperature=3.0) # Invalid
except ValueError as e:
    print(f"Config error: {e}") # "temperature must be between 0 and 2"

# API key validation
try:
    config = APIASConfig()
    key = config.get_api_key() # Raises if no key found
except ValueError as e:
    print(f"API key error: {e}")

# File loading errors
try:
    config = APIASConfig.from_yaml("nonexistent.yaml")
except FileNotFoundError as e:
    print(f"File error: {e}")

# URL validation (returns bool, doesn't raise)
if not validate_url("invalid-url"):
    print("Invalid URL format")

APIAS uses comprehensive type hints throughout. Example:
from typing import Dict, List
from pathlib import Path
from apias.config import APIASConfig, load_config
from apias.apias import Scraper, APIDocument

def process_docs(urls: List[str]) -> Dict[str, APIDocument]:
    """Process multiple URLs and return documents."""
    config: APIASConfig = load_config()
    scraper: Scraper = Scraper(quiet=config.quiet)
    results: Dict[str, APIDocument] = {}
    for url in urls:
        content, _ = scraper.scrape(url)
        if content:
            results[url] = APIDocument(content)
    return results

Related documentation:

- README.md - General documentation and CLI usage
- CHANGELOG.md - Version history
- CONTRIBUTING.md - Contribution guidelines