-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper_selenium.py
More file actions
43 lines (37 loc) · 1.31 KB
/
scraper_selenium.py
File metadata and controls
43 lines (37 loc) · 1.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import os
from typing import Dict
from bs4 import BeautifulSoup
import undetected_chromedriver as uc
# Chrome profile directory handed to the browser via --user-data-dir.
# Taken from SELENIUM_USER_DATA_DIR when that variable is set; otherwise the
# absolute path of ".selenium_profile" resolved against the working directory
# at import time.
USER_DATA_DIR = os.getenv(
    "SELENIUM_USER_DATA_DIR",
    os.path.abspath(".selenium_profile"),
)

# Headless mode is on by default; it is enabled only when SELENIUM_HEADLESS
# equals "true" (case-insensitive). Any other value disables it.
HEADLESS = os.getenv("SELENIUM_HEADLESS", "true").lower() == "true"
def _extract(html: str) -> Dict:
    """Pull basic profile fields out of a page's HTML.

    The name comes from the ``<title>`` text with the ``"| LinkedIn"``
    suffix removed; the headline and picture URL come from the
    ``og:description`` and ``og:image`` meta tags. Any field that cannot be
    found is returned as an empty string.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        A dict with the keys ``full_name``, ``headline``, ``summary`` and
        ``profile_pic_url``. ``summary`` carries the same value as
        ``headline``.
    """
    document = BeautifulSoup(html, "html.parser")

    def meta_content(selector: str) -> str:
        # Stripped ``content`` attribute of the first match, or "" when the
        # tag is missing or its content is empty.
        tag = document.select_one(selector)
        if not tag or not tag.get("content"):
            return ""
        return tag["content"].strip()

    title_tag = document.title
    title_text = title_tag.string if title_tag else None
    if title_text:
        full_name = title_text.replace("| LinkedIn", "").strip()
    else:
        full_name = ""

    description = meta_content('meta[property="og:description"]')
    picture_url = meta_content('meta[property="og:image"]')

    return {
        "full_name": full_name,
        "headline": description,
        "summary": description,
        "profile_pic_url": picture_url,
    }
def scrape_linkedin_profile_selenium(url: str, wait_seconds: float = 10.0) -> Dict:
    """Load *url* in Chrome and extract basic profile fields from the page.

    A Chrome instance is started through ``undetected_chromedriver`` using
    the module-level ``HEADLESS`` and ``USER_DATA_DIR`` settings, and is
    always shut down before returning, whether or not an error occurred.

    Args:
        url: Address of the profile page to load.
        wait_seconds: Maximum time to keep re-reading the page while neither
            a name nor a headline can be extracted. Defaults to 10 seconds.

    Returns:
        The dict produced by ``_extract`` once it contains a non-empty
        ``full_name`` or ``headline``; otherwise ``{"not_found": True}``
        after the wait has elapsed.
    """
    import time  # function-scope import keeps this block self-contained

    poll_interval = 0.5

    opts = uc.ChromeOptions()
    if HEADLESS:
        opts.add_argument("--headless=new")
    opts.add_argument(f"--user-data-dir={USER_DATA_DIR}")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    driver = uc.Chrome(options=opts)
    try:
        driver.get(url)
        # An implicit wait only affects element-location calls, so setting
        # it here never delayed reading page_source. Poll the rendered
        # markup explicitly instead, returning as soon as data shows up.
        deadline = time.monotonic() + wait_seconds
        while True:
            data = _extract(driver.page_source)
            if data.get("full_name") or data.get("headline"):
                return data
            if time.monotonic() >= deadline:
                return {"not_found": True}
            time.sleep(poll_interval)
    finally:
        driver.quit()