-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper_modern.py
More file actions
416 lines (356 loc) · 15.8 KB
/
scraper_modern.py
File metadata and controls
416 lines (356 loc) · 15.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
#!/usr/bin/env python3
"""
Modern LinkedIn Profile Scraper
A powerful, multi-method scraper that uses various techniques to extract LinkedIn profile data.
No external API keys required - uses browser automation and intelligent parsing.
"""
import os
import json
import time
import asyncio
from typing import Dict, Optional, List
from dataclasses import dataclass
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from langchain_core.tools import Tool
from dotenv import load_dotenv
# Import our scraping modules
from scraper_selenium import scrape_linkedin_profile_selenium
from scraper_local import scrape_linkedin_profile_local
from cache import get as cache_get, set as cache_set
load_dotenv()
@dataclass
class ScrapingResult:
    """Structured result from scraping operations"""
    success: bool  # True when a scraping method produced valid profile data
    data: Dict  # scraped profile fields (or fallback/placeholder data on failure)
    method: str  # name of the scraping method that produced this result
    error: Optional[str] = None  # error message when success is False
    cached: bool = False  # True when served from cache (caching is currently disabled)
class ModernLinkedInScraper:
    """
    Modern LinkedIn scraper with multiple fallback methods, tried in order:

    1. Scrapy with anti-detection middlewares
    2. "Ultra-modern" scraper (currently the same Scrapy backend)
    3. Authenticated Playwright session
    4. Local Playwright (persistent session)
    5. Selenium with undetected-chromedriver
    6. HTTP requests with session management
    7. Fallback to public profile data
    """

    def __init__(self):
        # Ordered (name, callable) pairs; scrape_profile() walks this list
        # top-to-bottom until a method returns valid data.
        self.methods = [
            ("scrapy_advanced", self._scrape_with_scrapy),
            ("ultra_modern", self._scrape_with_ultra_modern),
            ("authenticated_playwright", self._scrape_with_authenticated),
            ("local_playwright", self._scrape_with_playwright),
            ("selenium_undetected", self._scrape_with_selenium),
            ("http_session", self._scrape_with_http),
            ("public_fallback", self._scrape_public_fallback),
        ]

    def scrape_profile(self, linkedin_url: str, preferred_method: Optional[str] = None) -> ScrapingResult:
        """
        Scrape a LinkedIn profile using the best available method.

        Args:
            linkedin_url: LinkedIn profile URL.
            preferred_method: Name of an entry in self.methods to try first
                (optional).

        Returns:
            ScrapingResult with profile data; on total failure, a failed
            result carrying placeholder fallback data.
        """
        # NOTE: result caching is intentionally disabled — every call scrapes
        # fresh data. Successful results are still written via cache_set so a
        # future re-enable has warm data.
        print("🔄 Cache disabled - forcing fresh scraping every time")
        # Validate URL before spending time on any scraping method.
        if not self._is_valid_linkedin_url(linkedin_url):
            return ScrapingResult(
                success=False,
                data={},
                method="validation",
                error="Invalid LinkedIn URL"
            )
        # Try the preferred method first if one was requested.
        if preferred_method:
            for method_name, method_func in self.methods:
                if method_name == preferred_method:
                    result = self._try_method(method_name, method_func, linkedin_url)
                    if result.success:
                        cache_set(linkedin_url, result.data)
                        return result
        # Walk the remaining methods in priority order.
        for method_name, method_func in self.methods:
            if preferred_method and method_name == preferred_method:
                continue  # Already tried above
            print(f"🔄 Trying method: {method_name}")
            result = self._try_method(method_name, method_func, linkedin_url)
            if result.success:
                cache_set(linkedin_url, result.data)
                return result
        # All methods failed — return placeholder data so callers always get
        # a dict with the standard keys.
        return ScrapingResult(
            success=False,
            data=self._get_fallback_data(),
            method="fallback",
            error="All scraping methods failed"
        )

    def _try_method(self, method_name: str, method_func, linkedin_url: str) -> ScrapingResult:
        """Run one scraping method, converting any exception into a failed result."""
        try:
            data = method_func(linkedin_url)
            if data and self._is_valid_profile_data(data):
                return ScrapingResult(
                    success=True,
                    data=self._clean_profile_data(data),
                    method=method_name
                )
            return ScrapingResult(
                success=False,
                data={},
                method=method_name,
                error="Invalid or empty profile data"
            )
        except Exception as e:
            return ScrapingResult(
                success=False,
                data={},
                method=method_name,
                error=str(e)
            )

    def _run_optional_scrapy(self, linkedin_url: str, unavailable_msg: str, default_error: str) -> Dict:
        """Shared driver for the two Scrapy-backed methods.

        Falls back to the plain HTTP method when the optional
        scrapy_linkedin_scraper module is not installed.
        """
        try:
            from scrapy_linkedin_scraper import scrape_single_linkedin_profile
        except ImportError:
            print(unavailable_msg)
            return self._scrape_with_http(linkedin_url)
        result = scrape_single_linkedin_profile(linkedin_url)
        if result.get('success'):
            return result
        raise Exception(result.get('error', default_error))

    def _scrape_with_scrapy(self, linkedin_url: str) -> Dict:
        """Scrape using advanced Scrapy with anti-detection middlewares."""
        return self._run_optional_scrapy(
            linkedin_url,
            unavailable_msg="⚠️ Scrapy scraper not available, using HTTP fallback",
            default_error='Scrapy scraper failed',
        )

    def _scrape_with_ultra_modern(self, linkedin_url: str) -> Dict:
        """Scrape using "ultra-modern" techniques.

        Currently delegates to the same Scrapy backend as
        _scrape_with_scrapy; kept as a separate pipeline slot so a distinct
        implementation can be dropped in later.
        """
        return self._run_optional_scrapy(
            linkedin_url,
            unavailable_msg="⚠️ Ultra-modern scraper not available, using HTTP fallback",
            default_error='Ultra-modern scraper failed',
        )

    def _scrape_with_authenticated(self, linkedin_url: str) -> Dict:
        """Scrape using an authenticated Playwright session."""
        try:
            from scraper_authenticated import scrape_linkedin_authenticated
            return scrape_linkedin_authenticated(linkedin_url)
        except Exception as e:
            # Playwright cannot run inside an already-running asyncio loop;
            # degrade to plain HTTP in that case instead of failing outright.
            if "asyncio loop" in str(e):
                print("⚠️ Playwright async conflict detected, using fallback method")
                return self._scrape_with_http(linkedin_url)
            raise Exception(f"Authenticated scraper failed: {str(e)}")

    def _scrape_with_playwright(self, linkedin_url: str) -> Dict:
        """Scrape using Playwright with a persistent local session."""
        try:
            data = scrape_linkedin_profile_local(linkedin_url)
            # The local scraper flags pages that rendered a login wall.
            if data.get("not_logged_in"):
                raise Exception("Login required for Playwright method")
            return data
        except ImportError:
            # Guard for environments where the optional Playwright stack is missing.
            raise Exception("Playwright not available")

    def _scrape_with_selenium(self, linkedin_url: str) -> Dict:
        """Scrape using Selenium with undetected Chrome."""
        return scrape_linkedin_profile_selenium(linkedin_url)

    def _scrape_with_http(self, linkedin_url: str) -> Dict:
        """Scrape using plain HTTP requests with browser-like headers."""
        try:
            # Context manager ensures the connection pool is released.
            with requests.Session() as session:
                session.headers.update({
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Accept-Encoding': 'gzip, deflate',
                    'Connection': 'keep-alive',
                })
                response = session.get(linkedin_url, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                return self._extract_from_html(soup)
        except Exception as e:
            raise Exception(f"HTTP scraping failed: {e}")

    def _scrape_public_fallback(self, linkedin_url: str) -> Dict:
        """Last-resort method: synthesize minimal data from the URL alone."""
        username = self._extract_username_from_url(linkedin_url)
        return {
            "full_name": f"LinkedIn User ({username})",
            "headline": "LinkedIn Professional",
            "summary": f"This is a LinkedIn profile for {username}. Full details require authentication.",
            "profile_pic_url": "",
            "public_profile": True,
            "username": username
        }

    def _extract_from_html(self, soup: BeautifulSoup) -> Dict:
        """Extract profile data from HTML: OpenGraph tags, <title>, JSON-LD."""
        data = {}
        # OpenGraph meta tags are the most reliable public source.
        og_title = soup.find('meta', property='og:title')
        og_description = soup.find('meta', property='og:description')
        og_image = soup.find('meta', property='og:image')
        if og_title:
            data['full_name'] = og_title.get('content', '').replace(' | LinkedIn', '').strip()
        if og_description:
            data['headline'] = og_description.get('content', '').strip()
            data['summary'] = data['headline']
        if og_image:
            data['profile_pic_url'] = og_image.get('content', '').strip()
        # Fallback to the page <title>.
        if not data.get('full_name'):
            title = soup.find('title')
            if title:
                data['full_name'] = title.get_text().replace(' | LinkedIn', '').strip()
        # JSON-LD structured data, if present, can fill in/override fields.
        json_ld = soup.find('script', type='application/ld+json')
        if json_ld:
            try:
                structured_data = json.loads(json_ld.string)
                if isinstance(structured_data, dict):
                    data.update(self._extract_from_structured_data(structured_data))
            except (TypeError, ValueError):
                # TypeError: empty <script> (string is None); ValueError: bad JSON.
                pass
        return data

    def _extract_from_structured_data(self, structured_data: Dict) -> Dict:
        """Extract fields from a JSON-LD Person object."""
        extracted = {}
        if structured_data.get('@type') == 'Person':
            extracted['full_name'] = structured_data.get('name', '')
            extracted['headline'] = structured_data.get('jobTitle', '')
            if 'image' in structured_data:
                # 'image' may be a plain URL string or an ImageObject dict.
                image = structured_data['image']
                if isinstance(image, dict):
                    extracted['profile_pic_url'] = image.get('url', '')
                elif isinstance(image, str):
                    extracted['profile_pic_url'] = image
        return extracted

    def _is_valid_linkedin_url(self, url: str) -> bool:
        """Validate a LinkedIn profile URL.

        Accepts linkedin.com and any subdomain (www., country codes, ...),
        generalizing the previous hard-coded domain whitelist. The path must
        contain '/in/' (profile pages).
        """
        try:
            parsed = urlparse(url)
            host = parsed.netloc.lower()
            is_linkedin = host == 'linkedin.com' or host.endswith('.linkedin.com')
            return is_linkedin and '/in/' in parsed.path
        except (TypeError, ValueError, AttributeError):
            return False

    def _extract_username_from_url(self, url: str) -> str:
        """Extract the username (path segment following 'in') from a profile URL."""
        try:
            path_parts = urlparse(url).path.strip('/').split('/')
            if 'in' in path_parts:
                idx = path_parts.index('in')
                if idx + 1 < len(path_parts):
                    return path_parts[idx + 1]
        except (TypeError, ValueError, AttributeError):
            pass
        return "unknown"

    def _is_valid_profile_data(self, data: Dict) -> bool:
        """Check whether scraped data contains at least a name or a headline.

        None or non-string field values are treated as empty rather than
        raising (the old code crashed on full_name=None).
        """
        if not isinstance(data, dict):
            return False
        has_name = bool(str(data.get('full_name') or '').strip())
        has_headline = bool(str(data.get('headline') or '').strip())
        return has_name or has_headline

    def _clean_profile_data(self, data: Dict) -> Dict:
        """Clean and standardize profile data; empty values are dropped."""
        cleaned = {}
        # `or ''` guards against None values so .strip() never crashes.
        cleaned['full_name'] = str(data.get('full_name') or '').strip()
        cleaned['headline'] = str(data.get('headline') or '').strip()
        # summary falls back to headline only when the key is absent,
        # preserving the original semantics.
        cleaned['summary'] = str(data.get('summary', data.get('headline', '')) or '').strip()
        cleaned['profile_pic_url'] = str(data.get('profile_pic_url') or '').strip()
        # Pass through optional fields untouched when truthy.
        for field in ['location', 'industry', 'connections', 'experience', 'education']:
            if data.get(field):
                cleaned[field] = data[field]
        # Remove empty values
        return {k: v for k, v in cleaned.items() if v}

    def _get_fallback_data(self) -> Dict:
        """Return placeholder data used when every scraping method fails."""
        return {
            "full_name": "Profile Not Available",
            "headline": "LinkedIn Profile Access Limited",
            "summary": "This LinkedIn profile could not be accessed. This may be due to privacy settings, login requirements, or network restrictions.",
            "profile_pic_url": "",
            "error": "scraping_failed"
        }
# Global scraper instance
# Module-level singleton shared by scrape_linkedin_profile_modern and the
# LangChain tool below, so browser sessions/state can be reused across calls.
_scraper = ModernLinkedInScraper()
def scrape_linkedin_profile_modern(linkedin_url: str, method: str = None) -> Dict:
    """Scrape a LinkedIn profile via the shared ModernLinkedInScraper.

    Args:
        linkedin_url: LinkedIn profile URL.
        method: Preferred scraping method name (optional).

    Returns:
        Dictionary with profile data, plus a '_scraping_info' entry recording
        which method ran, whether it succeeded, cache status, and any error.
    """
    outcome = _scraper.scrape_profile(linkedin_url, method)
    # Attach scraping metadata so callers can inspect how the data was obtained.
    outcome.data['_scraping_info'] = {
        'method': outcome.method,
        'success': outcome.success,
        'cached': outcome.cached,
        'error': outcome.error,
    }
    return outcome.data
# LangChain tool
# Exposes scrape_linkedin_profile_modern as a named Tool so LangChain agents
# can invoke the scraper. Only the URL positional argument is passed by agents.
scraper_tool = Tool(
    name="Modern LinkedIn Scraper",
    func=scrape_linkedin_profile_modern,
    description="Advanced LinkedIn profile scraper using multiple methods including Playwright, Selenium, and HTTP requests. No API keys required."
)
# Test function
def test_modern_scraper():
    """Smoke-test the modern scraper against a few well-known public profiles."""
    sample_profiles = [
        "https://www.linkedin.com/in/williamhgates/",
        "https://www.linkedin.com/in/jeffweiner08/",
        "https://www.linkedin.com/in/satyanadella/",
    ]
    print("🧪 Testing Modern LinkedIn Scraper")
    print("=" * 50)
    for profile_url in sample_profiles:
        print(f"\n🔍 Testing: {profile_url}")
        try:
            profile = scrape_linkedin_profile_modern(profile_url)
            used_method = profile.get('_scraping_info', {}).get('method', 'unknown')
            print(f"✅ Success! Method: {used_method}")
            print(f" Name: {profile.get('full_name', 'N/A')}")
            print(f" Headline: {profile.get('headline', 'N/A')[:50]}...")
        except Exception as exc:
            print(f"❌ Failed: {exc}")
if __name__ == "__main__":
    # Run the smoke test only when executed as a script, not on import.
    test_modern_scraper()