-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper_local.py
More file actions
94 lines (75 loc) · 2.89 KB
/
scraper_local.py
File metadata and controls
94 lines (75 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
from typing import Dict
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process
# environment before the os.environ lookups below read their settings.
load_dotenv()
# Directory handed to Playwright as the persistent browser profile location;
# overridable through the PLAYWRIGHT_USER_DATA_DIR environment variable.
PLAYWRIGHT_USER_DATA_DIR = os.getenv(
    "PLAYWRIGHT_USER_DATA_DIR",
    ".playwright/linkedinsession",
)

# True only when PLAYWRIGHT_HEADLESS (default "true") equals "true",
# compared case-insensitively; any other value runs the browser headed.
HEADLESS = os.getenv("PLAYWRIGHT_HEADLESS", "true").lower() == "true"
def _extract_from_html(html: str) -> Dict:
    """Build a profile dict from raw page HTML.

    OpenGraph ``<meta property="og:...">`` tags are the primary source. The
    document ``<title>`` and the first ``h1``/``h2`` (preferring ones inside
    ``<main>``) serve as fallbacks when the OpenGraph values are empty.

    Returns a dict with the keys ``full_name``, ``headline``, ``summary`` and
    ``profile_pic_url``; any value that could not be found is ``""``.
    """
    document = BeautifulSoup(html, "html.parser")

    def og_content(prop: str) -> str:
        # Stripped ``content`` of the og:<prop> meta tag, or "" when absent.
        tag = document.select_one(f'meta[property="og:{prop}"]')
        if tag and tag.has_attr("content"):
            return tag["content"].strip()
        return ""

    def heading_text(tag_name: str) -> str:
        # Text of the first matching heading, preferring one inside <main>.
        node = document.select_one(f"main {tag_name}") or document.find(tag_name)
        return node.get_text(strip=True) if node else ""

    page_title = og_content("title")
    if not page_title:
        title_tag = document.title
        if title_tag and title_tag.string:
            page_title = title_tag.string.strip()
        else:
            page_title = ""
    og_description = og_content("description")
    picture = og_content("image")

    # LinkedIn titles carry a "| LinkedIn" marker; drop it to get the name,
    # and fall back to the page's h1 when nothing is left.
    full_name = page_title.replace("| LinkedIn", "").strip()
    if not full_name:
        full_name = heading_text("h1")

    # The OpenGraph description doubles as the headline; else try an h2.
    headline_text = og_description
    if not headline_text:
        headline_text = heading_text("h2")

    profile = {
        "full_name": full_name or "",
        "headline": headline_text or "",
        "summary": og_description or headline_text or "",
        "profile_pic_url": picture or "",
    }
    return profile
def scrape_linkedin_profile_local(linkedin_profile_url: str) -> Dict:
    """
    Scrape a LinkedIn profile using a local browser session (Playwright
    persistent context).

    - First run: the user logs in manually in the opened Chromium window
      (NOTE(review): this presumably requires PLAYWRIGHT_HEADLESS to be set to
      something other than "true", since HEADLESS defaults to True — confirm).
    - The browser profile lives in PLAYWRIGHT_USER_DATA_DIR, so the session is
      reused on later runs.

    Args:
        linkedin_profile_url: URL of the profile page to open.

    Returns:
        One of:
        - the dict produced by ``_extract_from_html`` (``full_name``,
          ``headline``, ``summary``, ``profile_pic_url``);
        - ``{"not_logged_in": True}`` when the page ended up on the LinkedIn
          login URL or shows the ``input#username`` field;
        - ``{"not_found": True}`` when neither name nor headline could be
          extracted and the image is the generic "linkedin-bug" one.

    Raises:
        playwright.sync_api.Error: (including its TimeoutError subclass) when
            the navigation or reading the page content fails. The browser
            context is closed before the error propagates.
    """
    # Local import: Playwright is only needed when this function runs.
    from playwright.sync_api import Error as PlaywrightError
    from playwright.sync_api import sync_playwright

    os.makedirs(PLAYWRIGHT_USER_DATA_DIR, exist_ok=True)

    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(
            PLAYWRIGHT_USER_DATA_DIR,
            headless=HEADLESS,
            args=["--disable-blink-features=AutomationControlled"],
        )
        # try/finally guarantees the context is closed on every exit path,
        # including a failed navigation or a failed content read.
        try:
            page = context.new_page()
            page.goto(
                linkedin_profile_url,
                wait_until="domcontentloaded",
                timeout=120_000,
            )

            # Login check. A failing visibility probe is treated as
            # "not on the login page" so scraping can still be attempted.
            try:
                needs_login = page.url.startswith(
                    "https://www.linkedin.com/login"
                ) or page.is_visible("input#username")
            except PlaywrightError:
                needs_login = False
            if needs_login:
                return {"not_logged_in": True}

            # Best effort: let dynamic content settle. Hitting the timeout
            # here is not fatal; whatever has rendered so far is used.
            try:
                page.wait_for_load_state("networkidle", timeout=10_000)
            except PlaywrightError:
                pass

            # Best effort: scroll down and back up to trigger lazy loads.
            try:
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(1200)
                page.evaluate("window.scrollTo(0, 0)")
                page.wait_for_timeout(400)
            except PlaywrightError:
                pass

            html = page.content()
        finally:
            context.close()

    # Parsing only needs the HTML string, so it runs after the browser closed.
    data = _extract_from_html(html)

    # Sanity: no name and no headline, combined with the generic
    # "linkedin-bug" image, means the profile was likely blocked or missing.
    has_identity = bool(data.get("full_name") or data.get("headline"))
    if not has_identity and "linkedin-bug" in (data.get("profile_pic_url") or ""):
        return {"not_found": True}
    return data