-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
63 lines (55 loc) · 2.29 KB
/
scraper.py
File metadata and controls
63 lines (55 loc) · 2.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
### **🛠️ Code (scraper.py)**
```python
import time
import random
import configparser
from selenium import webdriver
from selenium.webdriver.common.by import By
from undetected_chromedriver import Chrome, ChromeOptions
import pandas as pd
import openai
# Load config
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. An empty list therefore means config.ini was
# not loaded; fail here with a clear message instead of letting a bare
# KeyError surface later, and do it before a browser process is started.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini could not be read; it must provide a [linkedin] section "
        "(email, password) and an [openai] section (api_key)"
    )

# Set up Selenium
options = ChromeOptions()
options.add_argument("--headless")  # Run in background
# Module-level browser session shared by the functions in this file.
driver = Chrome(options=options)
def login_to_linkedin():
    """Sign in to LinkedIn using the credentials stored in ``config``.

    Opens the login page in the module-level ``driver``, types the ``email``
    and ``password`` values from the ``[linkedin]`` config section into the
    form, and submits it. A randomised pause follows the page load and the
    submit click. Returns nothing and does not verify that the sign-in
    actually succeeded.
    """
    driver.get("https://www.linkedin.com/login")
    time.sleep(random.uniform(1, 3))

    # (form field id, key in the [linkedin] section), in the order filled.
    form_fields = (("username", "email"), ("password", "password"))
    for field_id, config_key in form_fields:
        field = driver.find_element(By.ID, field_id)
        field.send_keys(config['linkedin'][config_key])

    submit_button = driver.find_element(By.XPATH, "//button[@type='submit']")
    submit_button.click()
    time.sleep(random.uniform(2, 5))
def _first_text(parent, class_name):
    """Return the text of the first element under *parent* with *class_name*, or ""."""
    # find_elements returns an empty list instead of raising when nothing
    # matches, so a missing field does not abort the whole scrape.
    matches = parent.find_elements(By.CLASS_NAME, class_name)
    return matches[0].text if matches else ""


def scrape_profiles(search_query, pages=3):
    """Collect name, title and company from LinkedIn people-search results.

    Args:
        search_query: Free-text keywords for the people search. The value is
            percent-encoded before being placed in the URL, so characters
            such as spaces, ``&`` or ``#`` are safe to pass.
        pages: Number of result pages to visit, starting at page 1. A value
            below 1 visits no pages.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Name``, ``Title`` and
        ``Company`` (always present, even when nothing was found). Result
        cards without a name element are skipped; a missing title or company
        is stored as an empty string.
    """
    from urllib.parse import quote, urlencode

    columns = ["Name", "Title", "Company"]
    profiles = []
    for page in range(1, pages + 1):
        # Encode the query so reserved characters cannot corrupt the URL.
        query_string = urlencode(
            {"keywords": search_query, "page": page}, quote_via=quote
        )
        url = f"https://www.linkedin.com/search/results/people/?{query_string}"
        driver.get(url)
        time.sleep(random.uniform(3, 7))  # Human-like delay

        # Extract profile data
        for element in driver.find_elements(By.CLASS_NAME, "entity-result__item"):
            name = _first_text(element, "entity-result__title-text")
            if not name:
                continue  # a card without a name is not a usable profile
            profiles.append({
                "Name": name,
                "Title": _first_text(element, "entity-result__primary-subtitle"),
                "Company": _first_text(element, "entity-result__secondary-subtitle"),
            })

    # Passing columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(profiles, columns=columns)
def enrich_with_openai(df):
    """Add an ``Email`` column holding a model-generated email guess per row.

    For every row, a chat completion is requested from the ``gpt-4`` model
    using the row's ``Name`` and ``Company`` values, and the reply text is
    stored in ``Email``. The stored value is whatever the model returns: a
    guess, not a verified address.

    Args:
        df: DataFrame with ``Name`` and ``Company`` columns. It is modified
            in place.

    Returns:
        The same DataFrame object, with an ``Email`` column (created even
        when the frame has no rows).

    Raises:
        KeyError: If the ``[openai]`` section or its ``api_key`` is missing
            from ``config`` (only checked when there are rows to process).
    """
    # Create the column up front so the output schema is identical whether
    # or not there are rows to process.
    if 'Email' not in df.columns:
        df['Email'] = ""
    if df.empty:
        return df  # nothing to enrich; skip API setup entirely

    api_key = config['openai']['api_key']
    openai.api_key = api_key
    if hasattr(openai, "OpenAI"):
        # openai>=1.0 removed the module-level ChatCompletion API; requests
        # go through a client object instead.
        create_completion = openai.OpenAI(api_key=api_key).chat.completions.create
    else:
        # Legacy (<1.0) SDK.
        create_completion = openai.ChatCompletion.create

    for idx, row in df.iterrows():
        response = create_completion(
            model="gpt-4",
            messages=[{
                "role": "user",
                "content": f"Predict the email of {row['Name']} at {row['Company']} in format firstname.lastname@company.com"
            }],
        )
        reply = response.choices[0].message.content
        # The reply may be None or padded with whitespace; store a clean string.
        df.at[idx, 'Email'] = (reply or "").strip()
    return df
if __name__ == "__main__":
    # Script entry point: log in, scrape two pages of search results, add the
    # model-generated email guesses, and write everything to leads.csv.
    try:
        login_to_linkedin()
        df = scrape_profiles("CTO at SaaS startups", pages=2)
        df = enrich_with_openai(df)
        df.to_csv("leads.csv", index=False)
    finally:
        # Always shut the browser down, even when a step above raises;
        # otherwise the Chrome process is left running.
        driver.quit()