-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtinystartups.py
More file actions
168 lines (131 loc) · 5.81 KB
/
tinystartups.py
File metadata and controls
168 lines (131 loc) · 5.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from urllib.parse import urlparse
import time
from websiteparser import SocialMediaScraper
from concurrent.futures import ThreadPoolExecutor
import asyncio
import json
import undetected_chromedriver as uc
import os
from dotenv import load_dotenv
load_dotenv()
def is_outgoing_link(href, base_domain="tinystartups.com"):
    """
    Check if a link is outgoing (external) by comparing domains.

    Args:
        href: The href attribute value to test (may be None or empty).
        base_domain: The site's own domain; links to it count as internal.

    Returns:
        True if href points at a different domain; False for empty,
        relative, fragment, mailto/tel/javascript, same-domain, or
        unparsable links.
    """
    try:
        if not href:
            return False
        # Relative paths, fragments and query-only links are internal
        if href.startswith(('/', '#', '?')):
            return False
        # Non-navigational schemes are never outgoing page links
        if href.startswith(('mailto:', 'tel:', 'javascript:')):
            return False
        parsed_url = urlparse(href)
        # No scheme -> most likely a relative URL
        if not parsed_url.scheme:
            return False
        link_domain = parsed_url.netloc.lower()
        # Strip only a LEADING "www." prefix. The previous
        # replace('www.', '') also mangled domains that merely contain
        # "www." (e.g. "mywww.dev" became "my.dev").
        if link_domain.startswith('www.'):
            link_domain = link_domain[4:]
        base = base_domain.lower()
        if base.startswith('www.'):
            base = base[4:]
        return link_domain != base and link_domain != ''
    except Exception as e:
        print(f"Error parsing URL '{href}': {str(e)}")
        return False
def scrape_tinystartups_links():
    """
    Scrape href links from anchor tags inside div[role='listitem']
    elements on tinystartups.com, split into outgoing vs internal.

    Returns:
        Tuple (outgoing_links, internal_links). Both are empty lists on
        timeout or any other error.
    """
    # undetected-chromedriver evades basic bot detection; headless so it
    # can run on servers without a display
    driver = uc.Chrome(headless=True, use_subprocess=True)
    try:
        url = "https://www.tinystartups.com/"
        print(f"Navigating to: {url}")
        driver.get(url)
        # Fixed sleep gives client-side rendering time to finish before
        # we query the DOM
        print("Waiting for page to load...")
        time.sleep(3)
        print("Finding div elements with role='listitem'...")
        elements = driver.find_elements(By.XPATH, "//div[contains(@role, 'listitem')]")
        print(f"Found {len(elements)} div elements with role='listitem'")
        outgoing_links = []
        internal_links = []
        for i, div_element in enumerate(elements):
            # find_elements returns a (possibly empty) list and never
            # raises NoSuchElementException, so the old per-div
            # try/except was dead code and has been removed
            anchor_tags = div_element.find_elements(By.TAG_NAME, "a")
            print(f"Div {i+1}: Found {len(anchor_tags)} anchor tags")
            for anchor in anchor_tags:
                href = anchor.get_attribute("href")
                if href:  # skip empty href values
                    if is_outgoing_link(href):
                        outgoing_links.append(href)
                        print(f"  - OUTGOING: {href}")
                    else:
                        internal_links.append(href)
                        print(f"  - internal: {href}")
        print(f"\nTotal outgoing links found: {len(outgoing_links)}")
        print(f"Total internal links found: {len(internal_links)}")
        print(f"Total unique outgoing links: {len(set(outgoing_links))}")
        return outgoing_links, internal_links
    except TimeoutException:
        print("Timeout: Page took too long to load")
        return [], []
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return [], []
    finally:
        # Always release the browser process, even on error
        driver.quit()
def save_links_to_file(links, filename="tinystartups_outgoing_links.json"):
    """
    Save the scraped links to a JSON file (not plain text, despite the
    old docstring).

    Args:
        links: JSON-serializable structure (typically a list of URLs).
        filename: Output path; defaults to tinystartups_outgoing_links.json.

    Errors are reported to stdout rather than raised, matching the
    best-effort style of the rest of this script.
    """
    try:
        # Explicit encoding keeps output stable across platforms
        with open(filename, 'w', encoding='utf-8') as json_file:
            json.dump(links, json_file, indent=4)
    except Exception as e:
        print(f"Error saving links to file: {str(e)}")
async def tinyStartupScraperExecutor():
    """
    Orchestrate the pipeline: scrape tinystartups.com for outgoing links,
    then extract social-media links from each unique target site with at
    most 10 concurrent scrapes.
    """
    outgoing_links, internal_links = scrape_tinystartups_links()
    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")  # provided via .env

    # Guard clause: nothing to do if scraping failed or found no links
    if not outgoing_links:
        print("No outgoing links were found or an error occurred.")
        return

    targets = list(set(outgoing_links))
    banner = "=" * 50
    print(f"\n{banner}\nUNIQUE OUTGOING LINKS FOUND:\n{banner}")

    scraper = SocialMediaScraper(GEMINI_API_KEY)
    gate = asyncio.Semaphore(10)  # cap concurrent scrapes at 10

    async def bounded_scrape(target):
        # Acquire a slot before hitting the target site
        async with gate:
            return await scraper.scrape_social_links(target)

    results = await asyncio.gather(*(bounded_scrape(t) for t in targets))
    scraper.save_links_to_file(results)
# Main execution: run the async scraper pipeline when invoked as a script
if __name__ == "__main__":
    asyncio.run(tinyStartupScraperExecutor())