The /extract endpoint is for uploading files and extracting text content from them. It supports multiple file types and processors.
curl -X POST "http://localhost:8000/extract" \
  -F "file=@your_document.pdf"

curl -X POST "http://localhost:8000/extract" \
  -F "file=@your_document.pdf" | python -m json.tool

The endpoint supports:
- PDFs: .pdf
- Text files: .txt
- Word documents: .doc, .docx
- Excel files: .xls, .xlsx
- Images: .jpg, .jpeg, .png, .gif, .bmp, .tiff, .webp
- Videos: .mp4, .avi, .mov, .mkv, .webm
- Audio: .mp3, .wav, .m4a, .ogg
curl -X POST "http://localhost:8000/extract" \
  -F "file=@report.pdf"

curl -X POST "http://localhost:8000/extract" \
  -F "file=@document.docx"

curl -X POST "http://localhost:8000/extract" \
  -F "file=@screenshot.png"

curl -X POST "http://localhost:8000/extract" \
  -F "file=@recording.mp3"

curl -X POST "http://localhost:8000/extract" \
  -F "file=@video.mp4"

import requests
from pathlib import Path
def extract_text_from_file(file_path, timeout=300):
    """Extract text from a file via the /extract API endpoint.

    Args:
        file_path: Path to the file to upload.
        timeout: Seconds to wait for the server response. Without a
            timeout, requests.post can block forever on a hung server,
            so a generous default is supplied (backward-compatible).

    Returns:
        The extracted text on success, or None on any API/HTTP error
        (the error is printed, matching the original example's style).

    Raises:
        FileNotFoundError: If file_path does not exist (from open()).
    """
    url = "http://localhost:8000/extract"
    # Keep the file handle open while the upload is in flight.
    with open(file_path, 'rb') as f:
        files = {'file': (Path(file_path).name, f)}
        response = requests.post(url, files=files, timeout=timeout)
    if response.status_code == 200:
        result = response.json()
        if result['success']:
            return result['extracted_text']
        print(f"Error: {result['error']}")
        return None
    print(f"HTTP Error: {response.status_code}")
    return None
# Usage examples: extract text from several different media types.
for sample in ("document.pdf", "image.png", "audio.mp3"):
    extracted = extract_text_from_file(sample)
    print(extracted)

import requests
import json
from pathlib import Path
from typing import Dict, Any, Optional
class ContentExtractor:
    """Small client wrapper around the content-extraction HTTP API."""

    def __init__(self, base_url: str = "http://localhost:8000"):
        # Normalize so endpoint paths can always be appended with "/".
        self.base_url = base_url.rstrip('/')

    def extract_from_file(self, file_path: str, timeout: int = 300) -> Dict[str, Any]:
        """Extract content from a file with comprehensive error handling."""
        path = Path(file_path)
        if not path.exists():
            # Fail fast without touching the network.
            return {"success": False, "error": f"File not found: {path}"}
        try:
            with open(path, 'rb') as fh:
                upload = {'file': (path.name, fh)}
                resp = requests.post(
                    f"{self.base_url}/extract",
                    files=upload,
                    timeout=timeout,
                )
            # Raise for 4xx/5xx so HTTP failures surface as RequestException.
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.RequestException as exc:
            return {"success": False, "error": f"Request failed: {str(exc)}"}
        except Exception as exc:
            return {"success": False, "error": f"Unexpected error: {str(exc)}"}

    def extract_text_only(self, file_path: str) -> Optional[str]:
        """Extract just the text content."""
        outcome = self.extract_from_file(file_path)
        if outcome.get('success'):
            return outcome.get('extracted_text')
        return None
# Usage
extractor = ContentExtractor()
# Extract full details
result = extractor.extract_from_file("document.pdf")
if result['success']:
print(f"Text: {result['extracted_text']}")
print(f"Processor: {result['processor_used']}")
print(f"Processing time: {result['processing_time']}s")
else:
print(f"Error: {result['error']}")
# Extract text only
text = extractor.extract_text_only("image.png")
if text:
print(text){
"file_id": "uuid-string",
"file_info": {
"name": "document.pdf",
"extension": ".pdf",
"size_mb": 2.5,
"mime_type": "application/pdf",
"duration": null,
"chunks_processed": 3
},
"success": true,
"error": null,
"extracted_text": "Your extracted content here...",
"processor_used": "OpenAIProcessor",
"processing_time": 12.34,
"timestamp": "2025-09-22T15:30:45.123456"
}

{
"file_id": "uuid-string",
"file_info": {
"name": "document.pdf",
"extension": ".pdf",
"size_mb": 2.5,
"mime_type": "application/pdf"
},
"success": false,
"error": "Processing failed: API quota exceeded",
"extracted_text": null,
"processor_used": null,
"processing_time": 0.5,
"timestamp": "2025-09-22T15:30:45.123456"
}

curl -X GET "http://localhost:8000/supported-types"

curl -X POST "http://localhost:8000/extract-batch" \
  -F "files=@document1.pdf" \
  -F "files=@document2.docx" \
  -F "files=@image.png"

curl -X POST "http://localhost:8000/extract" \
  -F "file=@large_video.mp4" \
  --max-time 600

# Extract text from scanned PDFs
curl -X POST "http://localhost:8000/extract" -F "file=@scanned_document.pdf"

# Process Word documents
curl -X POST "http://localhost:8000/extract" -F "file=@contract.docx"

# Transcribe meeting recordings
curl -X POST "http://localhost:8000/extract" -F "file=@meeting.mp3"

# Extract text from presentation videos
curl -X POST "http://localhost:8000/extract" -F "file=@presentation.mp4"

# Extract text from screenshots
curl -X POST "http://localhost:8000/extract" -F "file=@screenshot.png"

# Process handwritten notes
curl -X POST "http://localhost:8000/extract" -F "file=@notes.jpg"

- File Size Limits: Default maximum is 100MB
- Processing Time: Varies by file type and size
- Text files: ~0.1s
- Images: ~3-5s
- Audio: ~30s per minute
- Video: ~30s per minute
- Large PDFs: ~1-2s per page
- Supported Processors:
- OpenAI (GPT + Vision + Whisper)
- Google Gemini (Text + Vision + Audio)
- Automatic fallback between processors
- API Keys Required: Ensure you have OpenAI and/or Google API keys configured
- "Unsupported file type":
  # Check supported types
  curl -X GET "http://localhost:8000/supported-types"
- "File too large":
  # Check file size
  ls -lh your_file.pdf
  # Compress or split large files
- "API quota exceeded"
- Check your OpenAI/Google API billing
- The system will automatically fallback to available processors
- "Connection timeout":
  # Increase timeout for large files
  curl --max-time 600 -X POST "http://localhost:8000/extract" -F "file=@large_file.mp4"
- /extract-url — Extract from file URLs
- /extract-batch — Process multiple files
- /transcribe-image — Specialized image processing
- /extract-youtube — YouTube video processing
View interactive documentation at: http://localhost:8000/docs