-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper_modern.py
More file actions
416 lines (356 loc) · 15.8 KB
/
scraper_modern.py
File metadata and controls
416 lines (356 loc) · 15.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
#!/usr/bin/env python3
"""
Modern LinkedIn Profile Scraper
A powerful, multi-method scraper that uses various techniques to extract LinkedIn profile data.
No external API keys required - uses browser automation and intelligent parsing.
"""
import os
import json
import time
import asyncio
from typing import Dict, Optional, List
from dataclasses import dataclass
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from langchain_core.tools import Tool
from dotenv import load_dotenv
# Import our scraping modules
from scraper_selenium import scrape_linkedin_profile_selenium
from scraper_local import scrape_linkedin_profile_local
from cache import get as cache_get, set as cache_set
load_dotenv()
@dataclass
class ScrapingResult:
    """Structured result from scraping operations"""
    success: bool  # True when a scraping method produced valid profile data
    data: Dict  # scraped profile fields (or fallback/placeholder data on failure)
    method: str  # name of the scraping method that produced this result
    error: Optional[str] = None  # error message when success is False
    cached: bool = False  # True when served from cache (caching is currently disabled)
class ModernLinkedInScraper:
    """
    Modern LinkedIn scraper with multiple fallback methods, tried in order:

    1. Scrapy with anti-detection middlewares
    2. "Ultra-modern" scraper (currently the same Scrapy backend)
    3. Authenticated Playwright session
    4. Local Playwright (persistent session)
    5. Selenium with undetected-chromedriver
    6. HTTP requests with session management
    7. Fallback to public profile data
    """

    def __init__(self):
        # Ordered (name, callable) pairs; scrape_profile() walks this list
        # top-to-bottom until a method returns valid data.
        self.methods = [
            ("scrapy_advanced", self._scrape_with_scrapy),
            ("ultra_modern", self._scrape_with_ultra_modern),
            ("authenticated_playwright", self._scrape_with_authenticated),
            ("local_playwright", self._scrape_with_playwright),
            ("selenium_undetected", self._scrape_with_selenium),
            ("http_session", self._scrape_with_http),
            ("public_fallback", self._scrape_public_fallback),
        ]

    def scrape_profile(self, linkedin_url: str, preferred_method: Optional[str] = None) -> ScrapingResult:
        """
        Scrape a LinkedIn profile using the best available method.

        Args:
            linkedin_url: LinkedIn profile URL.
            preferred_method: Name of an entry in self.methods to try first
                (optional).

        Returns:
            ScrapingResult with profile data; on total failure, a failed
            result carrying placeholder fallback data.
        """
        # NOTE: result caching is intentionally disabled — every call scrapes
        # fresh data. Successful results are still written via cache_set so a
        # future re-enable has warm data.
        print("🔄 Cache disabled - forcing fresh scraping every time")
        # Validate URL before spending time on any scraping method.
        if not self._is_valid_linkedin_url(linkedin_url):
            return ScrapingResult(
                success=False,
                data={},
                method="validation",
                error="Invalid LinkedIn URL"
            )
        # Try the preferred method first if one was requested.
        if preferred_method:
            for method_name, method_func in self.methods:
                if method_name == preferred_method:
                    result = self._try_method(method_name, method_func, linkedin_url)
                    if result.success:
                        cache_set(linkedin_url, result.data)
                        return result
        # Walk the remaining methods in priority order.
        for method_name, method_func in self.methods:
            if preferred_method and method_name == preferred_method:
                continue  # Already tried above
            print(f"🔄 Trying method: {method_name}")
            result = self._try_method(method_name, method_func, linkedin_url)
            if result.success:
                cache_set(linkedin_url, result.data)
                return result
        # All methods failed — return placeholder data so callers always get
        # a dict with the standard keys.
        return ScrapingResult(
            success=False,
            data=self._get_fallback_data(),
            method="fallback",
            error="All scraping methods failed"
        )

    def _try_method(self, method_name: str, method_func, linkedin_url: str) -> ScrapingResult:
        """Run one scraping method, converting any exception into a failed result."""
        try:
            data = method_func(linkedin_url)
            if data and self._is_valid_profile_data(data):
                return ScrapingResult(
                    success=True,
                    data=self._clean_profile_data(data),
                    method=method_name
                )
            return ScrapingResult(
                success=False,
                data={},
                method=method_name,
                error="Invalid or empty profile data"
            )
        except Exception as e:
            return ScrapingResult(
                success=False,
                data={},
                method=method_name,
                error=str(e)
            )

    def _run_optional_scrapy(self, linkedin_url: str, unavailable_msg: str, default_error: str) -> Dict:
        """Shared driver for the two Scrapy-backed methods.

        Falls back to the plain HTTP method when the optional
        scrapy_linkedin_scraper module is not installed.
        """
        try:
            from scrapy_linkedin_scraper import scrape_single_linkedin_profile
        except ImportError:
            print(unavailable_msg)
            return self._scrape_with_http(linkedin_url)
        result = scrape_single_linkedin_profile(linkedin_url)
        if result.get('success'):
            return result
        raise Exception(result.get('error', default_error))

    def _scrape_with_scrapy(self, linkedin_url: str) -> Dict:
        """Scrape using advanced Scrapy with anti-detection middlewares."""
        return self._run_optional_scrapy(
            linkedin_url,
            unavailable_msg="⚠️ Scrapy scraper not available, using HTTP fallback",
            default_error='Scrapy scraper failed',
        )

    def _scrape_with_ultra_modern(self, linkedin_url: str) -> Dict:
        """Scrape using "ultra-modern" techniques.

        Currently delegates to the same Scrapy backend as
        _scrape_with_scrapy; kept as a separate pipeline slot so a distinct
        implementation can be dropped in later.
        """
        return self._run_optional_scrapy(
            linkedin_url,
            unavailable_msg="⚠️ Ultra-modern scraper not available, using HTTP fallback",
            default_error='Ultra-modern scraper failed',
        )

    def _scrape_with_authenticated(self, linkedin_url: str) -> Dict:
        """Scrape using an authenticated Playwright session."""
        try:
            from scraper_authenticated import scrape_linkedin_authenticated
            return scrape_linkedin_authenticated(linkedin_url)
        except Exception as e:
            # Playwright cannot run inside an already-running asyncio loop;
            # degrade to plain HTTP in that case instead of failing outright.
            if "asyncio loop" in str(e):
                print("⚠️ Playwright async conflict detected, using fallback method")
                return self._scrape_with_http(linkedin_url)
            raise Exception(f"Authenticated scraper failed: {str(e)}")

    def _scrape_with_playwright(self, linkedin_url: str) -> Dict:
        """Scrape using Playwright with a persistent local session."""
        try:
            data = scrape_linkedin_profile_local(linkedin_url)
            # The local scraper flags pages that rendered a login wall.
            if data.get("not_logged_in"):
                raise Exception("Login required for Playwright method")
            return data
        except ImportError:
            # Guard for environments where the optional Playwright stack is missing.
            raise Exception("Playwright not available")

    def _scrape_with_selenium(self, linkedin_url: str) -> Dict:
        """Scrape using Selenium with undetected Chrome."""
        return scrape_linkedin_profile_selenium(linkedin_url)

    def _scrape_with_http(self, linkedin_url: str) -> Dict:
        """Scrape using plain HTTP requests with browser-like headers."""
        try:
            # Context manager ensures the connection pool is released.
            with requests.Session() as session:
                session.headers.update({
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Accept-Encoding': 'gzip, deflate',
                    'Connection': 'keep-alive',
                })
                response = session.get(linkedin_url, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                return self._extract_from_html(soup)
        except Exception as e:
            raise Exception(f"HTTP scraping failed: {e}")

    def _scrape_public_fallback(self, linkedin_url: str) -> Dict:
        """Last-resort method: synthesize minimal data from the URL alone."""
        username = self._extract_username_from_url(linkedin_url)
        return {
            "full_name": f"LinkedIn User ({username})",
            "headline": "LinkedIn Professional",
            "summary": f"This is a LinkedIn profile for {username}. Full details require authentication.",
            "profile_pic_url": "",
            "public_profile": True,
            "username": username
        }

    def _extract_from_html(self, soup: BeautifulSoup) -> Dict:
        """Extract profile data from HTML: OpenGraph tags, <title>, JSON-LD."""
        data = {}
        # OpenGraph meta tags are the most reliable public source.
        og_title = soup.find('meta', property='og:title')
        og_description = soup.find('meta', property='og:description')
        og_image = soup.find('meta', property='og:image')
        if og_title:
            data['full_name'] = og_title.get('content', '').replace(' | LinkedIn', '').strip()
        if og_description:
            data['headline'] = og_description.get('content', '').strip()
            data['summary'] = data['headline']
        if og_image:
            data['profile_pic_url'] = og_image.get('content', '').strip()
        # Fallback to the page <title>.
        if not data.get('full_name'):
            title = soup.find('title')
            if title:
                data['full_name'] = title.get_text().replace(' | LinkedIn', '').strip()
        # JSON-LD structured data, if present, can fill in/override fields.
        json_ld = soup.find('script', type='application/ld+json')
        if json_ld:
            try:
                structured_data = json.loads(json_ld.string)
                if isinstance(structured_data, dict):
                    data.update(self._extract_from_structured_data(structured_data))
            except (TypeError, ValueError):
                # TypeError: empty <script> (string is None); ValueError: bad JSON.
                pass
        return data

    def _extract_from_structured_data(self, structured_data: Dict) -> Dict:
        """Extract fields from a JSON-LD Person object."""
        extracted = {}
        if structured_data.get('@type') == 'Person':
            extracted['full_name'] = structured_data.get('name', '')
            extracted['headline'] = structured_data.get('jobTitle', '')
            if 'image' in structured_data:
                # 'image' may be a plain URL string or an ImageObject dict.
                image = structured_data['image']
                if isinstance(image, dict):
                    extracted['profile_pic_url'] = image.get('url', '')
                elif isinstance(image, str):
                    extracted['profile_pic_url'] = image
        return extracted

    def _is_valid_linkedin_url(self, url: str) -> bool:
        """Validate a LinkedIn profile URL.

        Accepts linkedin.com and any subdomain (www., country codes, ...),
        generalizing the previous hard-coded domain whitelist. The path must
        contain '/in/' (profile pages).
        """
        try:
            parsed = urlparse(url)
            host = parsed.netloc.lower()
            is_linkedin = host == 'linkedin.com' or host.endswith('.linkedin.com')
            return is_linkedin and '/in/' in parsed.path
        except (TypeError, ValueError, AttributeError):
            return False

    def _extract_username_from_url(self, url: str) -> str:
        """Extract the username (path segment following 'in') from a profile URL."""
        try:
            path_parts = urlparse(url).path.strip('/').split('/')
            if 'in' in path_parts:
                idx = path_parts.index('in')
                if idx + 1 < len(path_parts):
                    return path_parts[idx + 1]
        except (TypeError, ValueError, AttributeError):
            pass
        return "unknown"

    def _is_valid_profile_data(self, data: Dict) -> bool:
        """Check whether scraped data contains at least a name or a headline.

        None or non-string field values are treated as empty rather than
        raising (the old code crashed on full_name=None).
        """
        if not isinstance(data, dict):
            return False
        has_name = bool(str(data.get('full_name') or '').strip())
        has_headline = bool(str(data.get('headline') or '').strip())
        return has_name or has_headline

    def _clean_profile_data(self, data: Dict) -> Dict:
        """Clean and standardize profile data; empty values are dropped."""
        cleaned = {}
        # `or ''` guards against None values so .strip() never crashes.
        cleaned['full_name'] = str(data.get('full_name') or '').strip()
        cleaned['headline'] = str(data.get('headline') or '').strip()
        # summary falls back to headline only when the key is absent,
        # preserving the original semantics.
        cleaned['summary'] = str(data.get('summary', data.get('headline', '')) or '').strip()
        cleaned['profile_pic_url'] = str(data.get('profile_pic_url') or '').strip()
        # Pass through optional fields untouched when truthy.
        for field in ['location', 'industry', 'connections', 'experience', 'education']:
            if data.get(field):
                cleaned[field] = data[field]
        # Remove empty values
        return {k: v for k, v in cleaned.items() if v}

    def _get_fallback_data(self) -> Dict:
        """Return placeholder data used when every scraping method fails."""
        return {
            "full_name": "Profile Not Available",
            "headline": "LinkedIn Profile Access Limited",
            "summary": "This LinkedIn profile could not be accessed. This may be due to privacy settings, login requirements, or network restrictions.",
            "profile_pic_url": "",
            "error": "scraping_failed"
        }
# Global scraper instance
# Module-level singleton shared by scrape_linkedin_profile_modern and the
# LangChain tool below, so browser sessions/state can be reused across calls.
_scraper = ModernLinkedInScraper()
def scrape_linkedin_profile_modern(linkedin_url: str, method: str = None) -> Dict:
    """Scrape a LinkedIn profile via the shared ModernLinkedInScraper.

    Args:
        linkedin_url: LinkedIn profile URL.
        method: Preferred scraping method name (optional).

    Returns:
        Dictionary with profile data, plus a '_scraping_info' entry recording
        which method ran, whether it succeeded, cache status, and any error.
    """
    outcome = _scraper.scrape_profile(linkedin_url, method)
    # Attach scraping metadata so callers can inspect how the data was obtained.
    outcome.data['_scraping_info'] = {
        'method': outcome.method,
        'success': outcome.success,
        'cached': outcome.cached,
        'error': outcome.error,
    }
    return outcome.data
# LangChain tool
# Exposes scrape_linkedin_profile_modern as a named Tool so LangChain agents
# can invoke the scraper. Only the URL positional argument is passed by agents.
scraper_tool = Tool(
    name="Modern LinkedIn Scraper",
    func=scrape_linkedin_profile_modern,
    description="Advanced LinkedIn profile scraper using multiple methods including Playwright, Selenium, and HTTP requests. No API keys required."
)
# Test function
def test_modern_scraper():
    """Smoke-test the modern scraper against a few well-known public profiles."""
    sample_profiles = [
        "https://www.linkedin.com/in/williamhgates/",
        "https://www.linkedin.com/in/jeffweiner08/",
        "https://www.linkedin.com/in/satyanadella/",
    ]
    print("🧪 Testing Modern LinkedIn Scraper")
    print("=" * 50)
    for profile_url in sample_profiles:
        print(f"\n🔍 Testing: {profile_url}")
        try:
            profile = scrape_linkedin_profile_modern(profile_url)
            used_method = profile.get('_scraping_info', {}).get('method', 'unknown')
            print(f"✅ Success! Method: {used_method}")
            print(f" Name: {profile.get('full_name', 'N/A')}")
            print(f" Headline: {profile.get('headline', 'N/A')[:50]}...")
        except Exception as exc:
            print(f"❌ Failed: {exc}")
if __name__ == "__main__":
    # Run the smoke test only when executed as a script, not on import.
    test_modern_scraper()