-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtinystartups.py
More file actions
168 lines (131 loc) · 5.81 KB
/
tinystartups.py
File metadata and controls
168 lines (131 loc) · 5.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from urllib.parse import urlparse
import time
from websiteparser import SocialMediaScraper
from concurrent.futures import ThreadPoolExecutor
import asyncio
import json
import undetected_chromedriver as uc
import os
from dotenv import load_dotenv
load_dotenv()
def is_outgoing_link(href, base_domain="tinystartups.com"):
    """
    Check if a link is outgoing (external) by comparing domains.

    Args:
        href: The href attribute value to test (may be None or empty).
        base_domain: The site's own domain; links to it count as internal.

    Returns:
        True if href points at a different domain; False for empty,
        relative, fragment, mailto/tel/javascript, same-domain, or
        unparsable links.
    """
    try:
        if not href:
            return False
        # Relative paths, fragments and query-only links are internal
        if href.startswith(('/', '#', '?')):
            return False
        # Non-navigational schemes are never outgoing page links
        if href.startswith(('mailto:', 'tel:', 'javascript:')):
            return False
        parsed_url = urlparse(href)
        # No scheme -> most likely a relative URL
        if not parsed_url.scheme:
            return False
        link_domain = parsed_url.netloc.lower()
        # Strip only a LEADING "www." prefix. The previous
        # replace('www.', '') also mangled domains that merely contain
        # "www." (e.g. "mywww.dev" became "my.dev").
        if link_domain.startswith('www.'):
            link_domain = link_domain[4:]
        base = base_domain.lower()
        if base.startswith('www.'):
            base = base[4:]
        return link_domain != base and link_domain != ''
    except Exception as e:
        print(f"Error parsing URL '{href}': {str(e)}")
        return False
def scrape_tinystartups_links():
    """
    Scrape href links from anchor tags inside div[role='listitem']
    elements on tinystartups.com, split into outgoing vs internal.

    Returns:
        Tuple (outgoing_links, internal_links). Both are empty lists on
        timeout or any other error.
    """
    # undetected-chromedriver evades basic bot detection; headless so it
    # can run on servers without a display
    driver = uc.Chrome(headless=True, use_subprocess=True)
    try:
        url = "https://www.tinystartups.com/"
        print(f"Navigating to: {url}")
        driver.get(url)
        # Fixed sleep gives client-side rendering time to finish before
        # we query the DOM
        print("Waiting for page to load...")
        time.sleep(3)
        print("Finding div elements with role='listitem'...")
        elements = driver.find_elements(By.XPATH, "//div[contains(@role, 'listitem')]")
        print(f"Found {len(elements)} div elements with role='listitem'")
        outgoing_links = []
        internal_links = []
        for i, div_element in enumerate(elements):
            # find_elements returns a (possibly empty) list and never
            # raises NoSuchElementException, so the old per-div
            # try/except was dead code and has been removed
            anchor_tags = div_element.find_elements(By.TAG_NAME, "a")
            print(f"Div {i+1}: Found {len(anchor_tags)} anchor tags")
            for anchor in anchor_tags:
                href = anchor.get_attribute("href")
                if href:  # skip empty href values
                    if is_outgoing_link(href):
                        outgoing_links.append(href)
                        print(f"  - OUTGOING: {href}")
                    else:
                        internal_links.append(href)
                        print(f"  - internal: {href}")
        print(f"\nTotal outgoing links found: {len(outgoing_links)}")
        print(f"Total internal links found: {len(internal_links)}")
        print(f"Total unique outgoing links: {len(set(outgoing_links))}")
        return outgoing_links, internal_links
    except TimeoutException:
        print("Timeout: Page took too long to load")
        return [], []
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return [], []
    finally:
        # Always release the browser process, even on error
        driver.quit()
def save_links_to_file(links, filename="tinystartups_outgoing_links.json"):
    """
    Save the scraped links to a JSON file (not plain text, despite the
    old docstring).

    Args:
        links: JSON-serializable structure (typically a list of URLs).
        filename: Output path; defaults to tinystartups_outgoing_links.json.

    Errors are reported to stdout rather than raised, matching the
    best-effort style of the rest of this script.
    """
    try:
        # Explicit encoding keeps output stable across platforms
        with open(filename, 'w', encoding='utf-8') as json_file:
            json.dump(links, json_file, indent=4)
    except Exception as e:
        print(f"Error saving links to file: {str(e)}")
async def tinyStartupScraperExecutor():
    """
    Orchestrate the pipeline: scrape tinystartups.com for outgoing links,
    then extract social-media links from each unique target site with at
    most 10 concurrent scrapes.
    """
    outgoing_links, internal_links = scrape_tinystartups_links()
    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")  # provided via .env

    # Guard clause: nothing to do if scraping failed or found no links
    if not outgoing_links:
        print("No outgoing links were found or an error occurred.")
        return

    targets = list(set(outgoing_links))
    banner = "=" * 50
    print(f"\n{banner}\nUNIQUE OUTGOING LINKS FOUND:\n{banner}")

    scraper = SocialMediaScraper(GEMINI_API_KEY)
    gate = asyncio.Semaphore(10)  # cap concurrent scrapes at 10

    async def bounded_scrape(target):
        # Acquire a slot before hitting the target site
        async with gate:
            return await scraper.scrape_social_links(target)

    results = await asyncio.gather(*(bounded_scrape(t) for t in targets))
    scraper.save_links_to_file(results)
# Main execution: run the async scraper pipeline when invoked as a script
if __name__ == "__main__":
    asyncio.run(tinyStartupScraperExecutor())