-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_tests.py
More file actions
421 lines (338 loc) · 16.4 KB
/
run_tests.py
File metadata and controls
421 lines (338 loc) · 16.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
#!/usr/bin/env python3
"""
Comprehensive Test Suite for Advanced LinkedIn Scraping System
Tests all components: CAPTCHA solving, proxy rotation, OCR, and modern scraping
"""
import json
import os
import sys
import time
from datetime import datetime
from typing import Dict, List, Optional

# Add current directory to path for imports
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from dotenv import load_dotenv

load_dotenv()
class AdvancedScrapingTestSuite:
    """Comprehensive test suite for the advanced scraping system.

    Each ``test_*`` method probes one subsystem (configuration, CAPTCHA
    solving, proxy rotation, OCR, Scrapy middlewares, the modern scraper,
    and full integration) and records a pass/fail entry via
    :meth:`log_test`. :meth:`generate_report` then prints a summary and
    writes the detailed results to ``advanced_scraping_test_report.json``.

    Project modules (``scraping_config``, ``captcha_solver``, etc.) are
    imported lazily inside each test so a missing subsystem is logged as a
    failure instead of crashing the whole run.
    """

    def __init__(self):
        # Accumulated result records; see log_test for the dict schema.
        self.results: List[Dict] = []
        # Wall-clock start of the run; set by run_all_tests. Guarded in
        # generate_report so a standalone call doesn't raise TypeError.
        self.start_time: Optional[float] = None

    def log_test(self, test_name: str, success: bool, message: str = "", data: Optional[Dict] = None):
        """Record one test outcome and echo it to stdout.

        Args:
            test_name: Human-readable label. Its first word is used as the
                category key when generate_report aggregates results.
            success: Whether the test passed.
            message: Optional detail printed next to the pass/fail marker.
            data: Optional structured payload stored with the record.
        """
        result = {
            "test": test_name,
            "success": success,
            "message": message,
            "timestamp": datetime.now().isoformat(),
            "data": data,
        }
        self.results.append(result)
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"{status} {test_name}: {message}")

    def run_all_tests(self):
        """Run comprehensive tests for the advanced scraping system."""
        print("🚀 Advanced LinkedIn Scraping System - Test Suite")
        print("=" * 70)
        self.start_time = time.time()

        # Test 1: Configuration validation
        self.test_configuration()
        # Test 2: CAPTCHA solving system
        self.test_captcha_system()
        # Test 3: Proxy rotation system
        self.test_proxy_system()
        # Test 4: OCR capabilities
        self.test_ocr_system()
        # Test 5: Advanced Scrapy system
        self.test_scrapy_system()
        # Test 6: Modern scraper integration
        self.test_modern_scraper()
        # Test 7: Full system integration
        self.test_full_integration()

        # Generate comprehensive report
        self.generate_report()

    def test_configuration(self):
        """Validate LinkedIn credentials, CAPTCHA services, and scraping methods."""
        print("\n🔧 Testing System Configuration")
        print("-" * 50)
        try:
            from scraping_config import config

            status = config.validate_config()

            # Test LinkedIn authentication
            if status['linkedin_auth']:
                self.log_test("LinkedIn Authentication", True, f"Configured for {config.LINKEDIN_EMAIL}")
            else:
                self.log_test("LinkedIn Authentication", False, "No credentials found")

            # Test CAPTCHA services
            enabled_services = config.get_enabled_captcha_services()
            if enabled_services:
                self.log_test("CAPTCHA Services", True, f"Enabled: {', '.join(enabled_services)}")
            else:
                self.log_test("CAPTCHA Services", False, "No CAPTCHA services configured")

            # Test scraping methods
            enabled_methods = config.get_enabled_scraping_methods()
            if enabled_methods:
                method_names = [m['name'] for m in enabled_methods]
                self.log_test("Scraping Methods", True, f"Enabled: {', '.join(method_names)}")
            else:
                self.log_test("Scraping Methods", False, "No scraping methods enabled")
        except ImportError as e:
            self.log_test("Configuration System", False, f"Import error: {e}")
        except Exception as e:
            self.log_test("Configuration System", False, f"Error: {e}")

    def test_captcha_system(self):
        """Check third-party CAPTCHA service balance and the OCR fallback deps."""
        print("\n🤖 Testing CAPTCHA Solving System")
        print("-" * 50)
        try:
            from captcha_solver import captcha_solver

            # Test service availability; None means no key / service down.
            balance = captcha_solver.get_balance("2captcha")
            if balance is not None:
                self.log_test("2captcha Service", True, f"Balance: ${balance}")
            else:
                self.log_test("2captcha Service", False, "Service unavailable or no API key")

            # Test OCR fallback dependencies only (actual OCR is tested separately).
            try:
                import pytesseract
                from PIL import Image
                self.log_test("OCR Fallback", True, "Pytesseract and PIL available")
            except ImportError as e:
                self.log_test("OCR Fallback", False, f"Missing dependencies: {e}")
        except ImportError as e:
            self.log_test("CAPTCHA System", False, f"Import error: {e}")
        except Exception as e:
            self.log_test("CAPTCHA System", False, f"Error: {e}")

    def test_proxy_system(self):
        """Verify proxies load and that successive picks actually rotate."""
        print("\n📡 Testing Proxy Rotation System")
        print("-" * 50)
        try:
            from scrapy_linkedin_scraper import ProxyRotationMiddleware

            proxy_middleware = ProxyRotationMiddleware()
            proxy_count = len(proxy_middleware.proxy_list)
            if proxy_count > 0:
                self.log_test("Proxy Loading", True, f"Loaded {proxy_count} proxies")

                # Test proxy rotation: two consecutive picks should differ.
                # NOTE(review): with a single proxy this legitimately reports
                # failure — confirm that is the intended signal.
                proxy1 = proxy_middleware._get_next_proxy()
                proxy2 = proxy_middleware._get_next_proxy()
                if proxy1 != proxy2:
                    self.log_test("Proxy Rotation", True, "Rotation working correctly")
                else:
                    self.log_test("Proxy Rotation", False, "Rotation not working")
            else:
                self.log_test("Proxy System", False, "No proxies loaded")
        except ImportError as e:
            self.log_test("Proxy System", False, f"Import error: {e}")
        except Exception as e:
            self.log_test("Proxy System", False, f"Error: {e}")

    def test_ocr_system(self):
        """Check pytesseract/PIL imports and that the tesseract binary runs."""
        print("\n🔍 Testing OCR System")
        print("-" * 50)
        try:
            import pytesseract
            from PIL import Image

            # Create a simple blank test image; we only care that the
            # tesseract pipeline executes, not what text it returns.
            test_image = Image.new('RGB', (200, 50), color='white')

            try:
                # This will fail if tesseract is not installed, but that's expected
                pytesseract.image_to_string(test_image)
                self.log_test("OCR Processing", True, "Tesseract processing works")
            except Exception as e:
                if "tesseract" in str(e).lower():
                    self.log_test("OCR Processing", False, "Tesseract not installed")
                else:
                    self.log_test("OCR Processing", False, f"OCR error: {e}")

            self.log_test("OCR Dependencies", True, "PIL and pytesseract imported successfully")
        except ImportError as e:
            self.log_test("OCR System", False, f"Missing dependencies: {e}")
        except Exception as e:
            self.log_test("OCR System", False, f"Error: {e}")

    def test_scrapy_system(self):
        """Instantiate the spider and its user-agent/CAPTCHA middlewares."""
        print("\n🕷️ Testing Advanced Scrapy System")
        print("-" * 50)
        try:
            from scrapy_linkedin_scraper import LinkedInProfileSpider, RotateUserAgentMiddleware, CaptchaSolverMiddleware

            # Test spider initialization
            spider = LinkedInProfileSpider()
            self.log_test("Scrapy Spider", True, "Spider initialized successfully")

            # Test user agent middleware
            ua_middleware = RotateUserAgentMiddleware()
            if hasattr(ua_middleware, 'user_agent_list') and ua_middleware.user_agent_list:
                self.log_test("User Agent Rotation", True, f"{len(ua_middleware.user_agent_list)} agents available")
            else:
                self.log_test("User Agent Rotation", False, "No user agents loaded")

            # Test CAPTCHA middleware
            captcha_middleware = CaptchaSolverMiddleware()
            if captcha_middleware.solver_available:
                self.log_test("CAPTCHA Middleware", True, "CAPTCHA solver integrated")
            else:
                self.log_test("CAPTCHA Middleware", False, "CAPTCHA solver not available")
        except ImportError as e:
            self.log_test("Scrapy System", False, f"Import error: {e}")
        except Exception as e:
            self.log_test("Scrapy System", False, f"Error: {e}")

    def test_modern_scraper(self):
        """Scrape a live profile URL and judge the quality of returned data."""
        print("\n🚀 Testing Modern Scraper Integration")
        print("-" * 50)
        try:
            from scraper_modern import scrape_linkedin_profile_modern

            test_url = "https://www.linkedin.com/in/liveankit"
            print(f"Testing with URL: {test_url}")
            result = scrape_linkedin_profile_modern(test_url)

            # The scraper annotates its result with method/success metadata.
            scraping_info = result.get('_scraping_info', {})
            method = scraping_info.get('method', 'unknown')
            success = scraping_info.get('success', False)
            name = result.get('full_name', 'N/A')

            if success:
                self.log_test("Modern Scraper", True, f"Method: {method}, Name: {name}")

                # Check data quality: these placeholder names mean the scraper
                # fell back to generic output rather than real profile data.
                if name not in ['LinkedIn User', 'Profile Not Accessible', 'LinkedIn Profile']:
                    self.log_test("Data Quality", True, "Real profile data extracted")
                else:
                    self.log_test("Data Quality", False, "Generic data returned")
            else:
                self.log_test("Modern Scraper", False, f"Scraping failed with method: {method}")
        except ImportError as e:
            self.log_test("Modern Scraper", False, f"Import error: {e}")
        except Exception as e:
            self.log_test("Modern Scraper", False, f"Error: {e}")

    def test_full_integration(self):
        """Run the end-to-end pipeline and validate the result structure."""
        print("\n🎯 Testing Full System Integration")
        print("-" * 50)
        try:
            from agent_modern import generate_profile_summary_and_facts_single_step

            test_name = "Satya Nadella"
            print(f"Testing full integration with: {test_name}")
            result_json = generate_profile_summary_and_facts_single_step(test_name)

            # Parse result: the pipeline may return either a JSON string or a dict.
            if isinstance(result_json, str):
                result = json.loads(result_json)
            else:
                result = result_json

            # Validate result structure
            required_fields = ['full_name', 'headline', 'summary', 'interesting_facts']
            missing_fields = [field for field in required_fields if field not in result]
            if not missing_fields:
                self.log_test("Result Structure", True, "All required fields present")

                # Check data quality
                name = result.get('full_name', '')
                facts = result.get('interesting_facts', [])
                if name and name not in ['LinkedIn User', 'Profile Not Accessible']:
                    self.log_test("Integration Quality", True, f"Name: {name}, Facts: {len(facts)}")
                else:
                    self.log_test("Integration Quality", False, "Generic data in final result")
            else:
                self.log_test("Result Structure", False, f"Missing fields: {missing_fields}")
        except ImportError as e:
            self.log_test("Full Integration", False, f"Import error: {e}")
        except Exception as e:
            self.log_test("Full Integration", False, f"Error: {e}")

    def generate_report(self):
        """Print summary stats, per-category breakdown, health assessment,
        and recommendations; save the full report to JSON."""
        # Guard: start_time is None if this is called without run_all_tests.
        total_time = time.time() - self.start_time if self.start_time is not None else 0.0

        print("\n" + "=" * 70)
        print("📊 ADVANCED SCRAPING SYSTEM TEST REPORT")
        print("=" * 70)

        # Summary statistics
        total_tests = len(self.results)
        passed_tests = sum(1 for r in self.results if r["success"])
        failed_tests = total_tests - passed_tests
        success_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0

        print(f"📈 Total Tests: {total_tests}")
        print(f"✅ Passed: {passed_tests}")
        print(f"❌ Failed: {failed_tests}")
        print(f"📊 Success Rate: {success_rate:.1f}%")
        print(f"⏱️ Total Time: {total_time:.2f}s")

        # Category breakdown, keyed on the first word of each test name.
        categories = {}
        for result in self.results:
            category = result["test"].split(" ")[0]
            if category not in categories:
                categories[category] = {"passed": 0, "total": 0}
            categories[category]["total"] += 1
            if result["success"]:
                categories[category]["passed"] += 1

        print("\n📋 Category Breakdown:")
        print("-" * 70)
        for category, stats in categories.items():
            rate = (stats["passed"] / stats["total"]) * 100
            print(f"{category:20} {stats['passed']:2}/{stats['total']:2} ({rate:5.1f}%)")

        # System health assessment, bucketed by overall success rate.
        print("\n🏥 System Health Assessment:")
        print("-" * 70)
        if success_rate >= 90:
            health = "Excellent"
            emoji = "🎉"
        elif success_rate >= 75:
            health = "Good"
            emoji = "✅"
        elif success_rate >= 50:
            health = "Fair"
            emoji = "⚠️"
        else:
            health = "Poor"
            emoji = "❌"
        print(f"{emoji} Overall System Health: {health} ({success_rate:.1f}%)")

        # Recommendations keyed off which categories had failures.
        print("\n💡 Recommendations:")
        print("-" * 70)
        if failed_tests == 0:
            print("🎉 All systems operational! Your advanced scraping system is ready for production.")
            print("✨ Features available:")
            print("   • Advanced CAPTCHA solving with third-party services")
            print("   • Proxy rotation with health checking")
            print("   • OCR fallback for simple CAPTCHAs")
            print("   • Scrapy with advanced middlewares")
            print("   • Modern browser automation with stealth")
        else:
            print("🔧 Areas for improvement:")
            failed_categories = set()
            for result in self.results:
                if not result["success"]:
                    category = result["test"].split(" ")[0]
                    failed_categories.add(category)
            for category in failed_categories:
                if category == "CAPTCHA":
                    print("   • Configure CAPTCHA service API keys (2captcha, Anti-Captcha)")
                elif category == "OCR":
                    print("   • Install Tesseract OCR: https://github.com/tesseract-ocr/tesseract")
                elif category == "Proxy":
                    print("   • Configure proxy list or enable automatic proxy fetching")
                elif category == "LinkedIn":
                    print("   • Verify LinkedIn credentials in .env file")

        # Save detailed report; explicit UTF-8 keeps output platform-independent.
        report_data = {
            "timestamp": datetime.now().isoformat(),
            "version": "advanced_v1.0",
            "system_health": health,
            "summary": {
                "total_tests": total_tests,
                "passed_tests": passed_tests,
                "failed_tests": failed_tests,
                "success_rate": success_rate,
                "total_time": total_time,
            },
            "categories": categories,
            "results": self.results,
        }
        with open("advanced_scraping_test_report.json", "w", encoding="utf-8") as f:
            json.dump(report_data, f, indent=2)
        print("\n💾 Detailed report saved to: advanced_scraping_test_report.json")
def main():
    """Entry point: print the banner, then run every subsystem check."""
    banner = (
        "🚀 Advanced LinkedIn Scraping System - Comprehensive Test Suite",
        "🔧 Testing CAPTCHA solving, proxy rotation, OCR, and modern scraping...",
    )
    for line in banner:
        print(line)
    print()

    # Build the suite and execute the full test sequence.
    AdvancedScrapingTestSuite().run_all_tests()


if __name__ == "__main__":
    main()