Linkedin_AI_Agent/scraper_local.py at main · LiveWithCodeAnkit/Linkedin_AI_Agent · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
from typing import Dict
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Load variables from a .env file (if present) before the settings below are read.
load_dotenv()

# Directory passed to Playwright's persistent context; overridable via the environment.
PLAYWRIGHT_USER_DATA_DIR = os.environ.get("PLAYWRIGHT_USER_DATA_DIR", ".playwright/linkedinsession")
# True only when PLAYWRIGHT_HEADLESS equals "true" (case-insensitive); defaults to headless.
HEADLESS = os.environ.get("PLAYWRIGHT_HEADLESS", "true").lower() == "true"


def _extract_from_html(html: str) -> Dict:
	"""Build a profile dict from page HTML.

	OpenGraph meta tags are preferred; the page <title>, an <h1> and an <h2>
	act as fallbacks. The result always has the keys "full_name", "headline",
	"summary" and "profile_pic_url", each holding a (possibly empty) string.
	"""
	document = BeautifulSoup(html, "html.parser")

	def og(prop: str) -> str:
		# Stripped content of <meta property="og:<prop>">, or "" when missing.
		tag = document.select_one(f'meta[property="og:{prop}"]')
		if tag and tag.has_attr("content"):
			return tag["content"].strip()
		return ""

	def heading_text(level: str) -> str:
		# Prefer the heading inside <main>; otherwise take the first one anywhere.
		node = document.select_one(f"main {level}") or document.find(level)
		if node:
			return node.get_text(strip=True)
		return ""

	image = og("image")
	description = og("description")

	# og:title wins; fall back to the text of the document's <title> element.
	title = og("title")
	if not title:
		title_tag = document.title
		if title_tag and title_tag.string:
			title = title_tag.string.strip()

	# Remove the "| LinkedIn" marker from the title; use an <h1> if nothing is left.
	name = title.replace("| LinkedIn", "").strip() or heading_text("h1")
	# The OpenGraph description doubles as the headline; use an <h2> if it is empty.
	headline = description or heading_text("h2")

	return {
		"full_name": name or "",
		"headline": headline or "",
		"summary": description or headline or "",
		"profile_pic_url": image or "",
	}


def scrape_linkedin_profile_local(linkedin_profile_url: str) -> Dict:
	"""
	Scrape LinkedIn using a local browser session (Playwright persistent context).
	- First run: user logs in manually in the opened Chromium window.
	- Session cookies are stored and reused.

	Args:
		linkedin_profile_url: URL of the profile page to open.

	Returns:
		- {"not_logged_in": True} if the page is the LinkedIn login form.
		- {"not_found": True} if neither a name nor a headline was extracted
		  and the image URL contains "linkedin-bug".
		- Otherwise the dict produced by _extract_from_html().

	Raises:
		playwright.sync_api.Error: if launching the browser, navigating to the
			URL (120 s timeout) or reading the page content fails.
	"""
	from playwright.sync_api import Error as PlaywrightError
	from playwright.sync_api import sync_playwright

	os.makedirs(PLAYWRIGHT_USER_DATA_DIR, exist_ok=True)

	with sync_playwright() as p:
		context = p.chromium.launch_persistent_context(
			PLAYWRIGHT_USER_DATA_DIR,
			headless=HEADLESS,
			args=["--disable-blink-features=AutomationControlled"],
		)
		# try/finally closes the context exactly once on every exit path,
		# including when navigation or content retrieval raises.
		try:
			page = context.new_page()
			page.goto(linkedin_profile_url, wait_until="domcontentloaded", timeout=120_000)

			# Login check. Only the probe itself is guarded: a failing probe is
			# treated as "not on the login page", and the early return stays
			# outside the try so nothing can swallow it.
			try:
				login_required = (
					page.url.startswith("https://www.linkedin.com/login")
					or page.is_visible("input#username")
				)
			except PlaywrightError:
				login_required = False
			if login_required:
				return {"not_logged_in": True}

			# Let dynamic content load; best-effort, a timeout here is not fatal.
			try:
				page.wait_for_load_state("networkidle", timeout=10_000)
			except PlaywrightError:
				pass
			# Scroll down and back up to trigger lazy loads; also best-effort.
			try:
				page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
				page.wait_for_timeout(1200)
				page.evaluate("window.scrollTo(0, 0)")
				page.wait_for_timeout(400)
			except PlaywrightError:
				pass

			data = _extract_from_html(page.content())

			# Sanity: no name and no headline AND only the generic LinkedIn image
			# means we did not get a real profile page.
			has_identity = bool(data.get("full_name") or data.get("headline"))
			if not has_identity and "linkedin-bug" in (data.get("profile_pic_url") or ""):
				return {"not_found": True}

			return data
		finally:
			context.close()