-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_reference_extraction.py
More file actions
191 lines (148 loc) · 6.55 KB
/
test_reference_extraction.py
File metadata and controls
191 lines (148 loc) · 6.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""
Test script for multi-level reference extraction functionality.
"""
import asyncio
import sys
import os
# Add the agentic_search module to the path
sys.path.append(os.path.join(os.path.dirname(__file__), 'agentic_search'))
from agentic_search.enhanced_research_pipeline import research_startup_idea_comprehensive
async def test_reference_extraction():
    """Test the enhanced research pipeline with reference extraction.

    Passes a sample startup idea to ``research_startup_idea_comprehensive``
    with ``max_depth=2``, ``max_pages_per_level=3``, ``max_search_results=5``
    and ``enable_reference_extraction=True``, then prints four counters read
    from ``results['pipeline_summary']``.

    Returns:
        The object returned by the pipeline, or ``None`` when any exception
        is raised inside the run (the message and traceback are printed
        instead of propagating).
    """
    # Example startup idea (kept with its leading and trailing newline; the
    # banner below prints the stripped form, the pipeline receives it as-is).
    idea_summary = (
        "\n"
        "A SaaS platform that uses AI to automatically generate and optimize\n"
        "social media content for small businesses. The platform analyzes\n"
        "competitor content, trending topics, and brand voice to create\n"
        "personalized posts across multiple social media channels.\n"
    )

    print("Testing Multi-Level Reference Extraction")
    print("=" * 60)
    print(f"Startup Idea: {idea_summary.strip()}")
    print("=" * 60)

    try:
        # Run comprehensive research with reference extraction enabled
        results = await research_startup_idea_comprehensive(
            idea_summary=idea_summary,
            max_depth=2,  # Go 2 levels deep
            max_pages_per_level=3,  # Process 3 pages per level
            max_search_results=5,  # 5 search results per query
            enable_reference_extraction=True,
        )
        print("\n✅ Test completed successfully!")

        # Print some key statistics
        summary = results['pipeline_summary']
        print("\nFINAL STATISTICS:")
        print(f" • Search Queries: {summary['queries_executed']}")
        print(f" • Initial Results: {summary['initial_search_results']}")
        print(f" • Total Pages Analyzed: {summary['total_pages_analyzed']}")
        print(f" • Reference Levels: {summary['reference_levels_processed']}")
        return results
    except Exception as e:
        # Deliberately broad: this is a manual smoke test, so any failure
        # (pipeline error or a missing summary key) is reported, not raised.
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()
        return None
async def test_basic_reference_extraction():
    """Test basic reference extraction on a few URLs.

    Builds a ``ReferenceExtractor(max_depth=2, max_pages_per_level=10)``,
    awaits ``extract_multi_level_references`` on three hard-coded URLs, and
    prints the counters under ``result['summary']`` followed by details of
    at most the first two entries of ``result['all_pages']`` (each with at
    most three of its references).

    Returns:
        The extraction result, or ``None`` when an exception is raised during
        extraction or reporting (the message and traceback are printed).

    Raises:
        ImportError: If ``agentic_search.reference_extractor`` cannot be
            imported; the import happens before the ``try`` block.
    """
    # Import the reference extractor
    from agentic_search.reference_extractor import ReferenceExtractor

    print("\nTesting Basic Reference Extraction")
    print("=" * 50)

    # Create extractor
    extractor = ReferenceExtractor(max_depth=2, max_pages_per_level=10)

    # Test URLs (using reliable news/tech sites)
    test_urls = [
        "https://techcrunch.com/2024/01/01/ai-startups-2024/",
        "https://www.forbes.com/sites/forbestechcouncil/",
        "https://venturebeat.com/ai/",
    ]

    try:
        print(f"Testing with {len(test_urls)} URLs...")

        # Extract multi-level references
        result = await extractor.extract_multi_level_references(test_urls)
        print("\n✅ Reference extraction completed!")

        stats = result['summary']
        print(f" • Total pages scraped: {stats['total_pages_scraped']}")
        print(f" • Total references found: {stats['total_references_found']}")
        print(f" • Levels processed: {stats['levels_processed']}")

        # Show some example references
        if result['all_pages']:
            print("\nExample page content:")
            for number, page in enumerate(result['all_pages'][:2], start=1):
                print(f" {number}. {page['title']}")
                print(f" URL: {page['url']}")
                print(f" Content length: {page['content_length']} chars")
                print(f" References found: {page['reference_count']}")
                if page['references']:
                    print(" Top references:")
                    for ref in page['references'][:3]:
                        anchor = ref['anchor_text']
                        # Truncate long anchor text to 50 chars and mark it.
                        ellipsis = '...' if len(anchor) > 50 else ''
                        print(f" • {anchor[:50]}{ellipsis}")
                        print(f" {ref['url']}")
        return result
    except Exception as e:
        # Deliberately broad: network/scraping failures and unexpected result
        # shapes are all reported the same way in this manual smoke test.
        print(f"❌ Basic test failed: {e}")
        import traceback
        traceback.print_exc()
        return None
def show_usage_examples():
    """Print usage examples for the multi-level reference extraction.

    Writes a banner followed by three Markdown-fenced Python snippets and a
    bulleted list of benefits to standard output. Nothing is executed; the
    snippets are plain text. Returns ``None``.
    """
    print("\n" + "=" * 80)
    print("USAGE EXAMPLES FOR MULTI-LEVEL REFERENCE EXTRACTION")
    print("=" * 80)
    # The help text is kept flush-left so it prints without leading indent.
    print("""
Example 1: Basic Reference Extraction
```python
from agentic_search.reference_extractor import ReferenceExtractor
# Create extractor
extractor = ReferenceExtractor(max_depth=2, max_pages_per_level=5)
# Extract from URLs
urls = ["https://example.com/page1", "https://example.com/page2"]
result = await extractor.extract_multi_level_references(urls)
print(f"Scraped {result['summary']['total_pages_scraped']} pages")
print(f"Found {result['summary']['total_references_found']} references")
```
Example 2: Enhanced Research Pipeline
```python
from agentic_search.enhanced_research_pipeline import research_startup_idea_comprehensive
# Run comprehensive research
results = await research_startup_idea_comprehensive(
    idea_summary="Your startup idea here...",
    max_depth=2, # Go 2 levels deep
    max_pages_per_level=5, # 5 pages per level
    max_search_results=10, # 10 search results per query
    enable_reference_extraction=True
)
```
Example 3: Enhance Existing Brave Search Results
```python
from agentic_search.brave_search import BraveSearchClient
from agentic_search.reference_extractor import enhance_search_with_references
# Get initial search results
client = BraveSearchClient()
search_response = await client.search("AI startups 2024")
brave_results = client.extract_web_results(search_response)
# Enhance with reference extraction
enhanced = await enhance_search_with_references(
    brave_results,
    max_depth=2,
    max_pages_per_level=3
)
```
Key Benefits:
• Discovers content not found in initial search results
• Finds references, citations, and related studies
• Builds comprehensive knowledge graphs
• Identifies competitor mentions and market data
• Extracts deeper insights from industry reports
""")
# Script entry point: prints the banner, the usage examples and a dependency
# hint. The network-dependent tests stay disabled unless uncommented below.
if __name__ == "__main__":
    print("🧪 Multi-Level Reference Extraction Test Suite")
    print("=" * 60)

    # Show usage examples first
    show_usage_examples()

    print("\n⚠️ Note: Some dependencies might be missing (beautifulsoup4, httpx)")
    print("Install them with: pip install beautifulsoup4 httpx")

    # Uncomment to run actual tests
    # print("\nRunning tests...")
    # asyncio.run(test_basic_reference_extraction())
    # asyncio.run(test_reference_extraction())