-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_preprocessing.py
More file actions
89 lines (68 loc) · 2.94 KB
/
debug_preprocessing.py
File metadata and controls
89 lines (68 loc) · 2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
import re
from app.services.intelligent_parser import IntelligentResumeParser
from app.services.pdf_extractor import PDFExtractor
def debug_preprocessing(pdf_path: str = "test_resume.pdf") -> None:
    """Debug what preprocessing does to location text.

    Extracts text from the PDF at *pdf_path*, prints its first 30 lines,
    picks out those of the 30 that contain a location keyword, and shows
    the result of each regex clean-up step on every such line. It then
    runs the parser's ``_preprocess_for_spacy`` over the whole text and
    reports which location patterns match the cleaned result.

    Args:
        pdf_path: Path of the PDF to inspect. Defaults to
            ``"test_resume.pdf"`` (relative to the working directory).

    Returns:
        None. Everything is printed to stdout. If the file cannot be
        opened or read, or no text is extracted, an ``ERROR:`` line is
        printed and the function returns early.
    """
    # Initialize parser
    parser = IntelligentResumeParser()

    # Extract text from PDF
    extractor = PDFExtractor()
    try:
        with open(pdf_path, "rb") as f:
            pdf_content = f.read()
    except OSError as exc:
        # Report a missing/unreadable file in the same style as the
        # extraction failure below rather than with a raw traceback.
        print(f"ERROR: Could not read PDF '{pdf_path}': {exc}")
        return

    text = extractor.extract_text(pdf_content)
    if not text:
        print("ERROR: Could not extract text from PDF")
        return

    print("=== ORIGINAL TEXT (first 30 lines) ===")
    lines = text.split('\n')[:30]
    for i, line in enumerate(lines, 1):
        print(f"{i:2d}: {line}")

    # Find location-related lines.
    # NOTE: only the 30 previewed lines are scanned, not the whole text.
    print("\n=== LOCATION-RELATED LINES ===")
    keywords = ('location', 'address', 'bangalore', 'karnataka')
    location_lines = []
    for i, line in enumerate(lines, 1):
        lowered = line.lower()
        if any(keyword in lowered for keyword in keywords):
            location_lines.append((i, line))
            print(f"{i:2d}: {line}")

    # Each entry is (label, pattern, replacement), applied in order to
    # the output of the previous step.
    # NOTE(review): presumably these mirror what _preprocess_for_spacy
    # does internally -- confirm against the parser.
    steps = (
        ("Step 1 (whitespace)", r'\s+', ' '),
        # Replaces every character outside the allowed set with a space.
        ("Step 2 (OCR fix)", r'[^\w\s\.\,\-\@\(\)\:\;\+\/]', ' '),
        # No-op on a single line: step 1 already turned all whitespace,
        # newlines included, into single spaces.
        ("Step 3 (line breaks)", r'\n+', '\n'),
        ("Step 4 (camelCase)", r'([a-z])([A-Z])', r'\1 \2'),
        ("Step 5 (ALLCAPS)", r'([A-Z])([A-Z][a-z])', r'\1 \2'),
    )

    # Test preprocessing on specific lines
    print("\n=== PREPROCESSING TEST ===")
    for line_num, line in location_lines:
        print(f"\nLine {line_num}: '{line}'")

        # Test each preprocessing step
        current = line
        for label, pattern, replacement in steps:
            current = re.sub(pattern, replacement, current)
            print(f" {label}: '{current}'")

        final = current.strip()
        print(f" Final: '{final}'")

    # Test full preprocessing
    print("\n=== FULL PREPROCESSING ===")
    cleaned_text = parser._preprocess_for_spacy(text)
    print(f"Original text length: {len(text)}")
    print(f"Cleaned text length: {len(cleaned_text)}")

    # Find location in cleaned text
    cleaned_lines = cleaned_text.split('\n')[:30]
    print("\nCleaned text (first 30 lines):")
    for i, line in enumerate(cleaned_lines, 1):
        print(f"{i:2d}: {line}")

    # Search for location patterns in cleaned text (whole text, not just
    # the preview). Patterns with a capture group yield the captured
    # value; the others yield the full match.
    print("\n=== LOCATION SEARCH IN CLEANED TEXT ===")
    location_patterns = [
        r'Location:\s*(.+)',
        r'Address:\s*(.+)',
        r'Bangalore[,\s]*Karnataka',
        r'Karnataka[,\s]*India'
    ]

    found_any = False
    for pattern in location_patterns:
        matches = re.findall(pattern, cleaned_text, re.IGNORECASE)
        if matches:
            found_any = True
            print(f"Pattern '{pattern}' found: {matches}")

    if not found_any:
        # Make "nothing survived preprocessing" visible instead of
        # ending on a bare section header.
        print("No location patterns matched in cleaned text")
# Run the debug routine only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    debug_preprocessing()