# CV-Resume-Parser/debug_preprocessing.py at main · Biraj007/CV-Resume-Parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3

import re
from app.services.intelligent_parser import IntelligentResumeParser
from app.services.pdf_extractor import PDFExtractor

# Number of leading lines shown in the previews and scanned for location keywords.
_PREVIEW_LINE_COUNT = 30

# Lower-case substrings that mark a line as location-related.
_LOCATION_KEYWORDS = ('location', 'address', 'bangalore', 'karnataka')

# (label, pattern, replacement) for each regex step traced line by line.
# NOTE(review): presumably mirrors the steps inside
# IntelligentResumeParser._preprocess_for_spacy -- confirm against the parser.
_PREPROCESSING_STEPS = (
    ("whitespace", r'\s+', ' '),
    ("OCR fix", r'[^\w\s\.\,\-\@\(\)\:\;\+\/]', ' '),
    # A single line never contains '\n' after the whitespace step, so this
    # step is a no-op here; it is kept so the trace lists all five steps.
    ("line breaks", r'\n+', '\n'),
    ("camelCase", r'([a-z])([A-Z])', r'\1 \2'),
    ("ALLCAPS", r'([A-Z])([A-Z][a-z])', r'\1 \2'),
)

# Regexes searched (case-insensitively) in the fully preprocessed text.
_LOCATION_PATTERNS = (
    r'Location:\s*(.+)',
    r'Address:\s*(.+)',
    r'Bangalore[,\s]*Karnataka',
    r'Karnataka[,\s]*India',
)


def _print_numbered(lines):
    """Print each line prefixed with its 1-based, width-2 line number."""
    for number, line in enumerate(lines, 1):
        print(f"{number:2d}: {line}")


def _trace_preprocessing_steps(line):
    """Apply every preprocessing regex in order, printing each intermediate result.

    Returns the final, stripped string.
    """
    current = line
    for step_number, (label, pattern, replacement) in enumerate(_PREPROCESSING_STEPS, 1):
        current = re.sub(pattern, replacement, current)
        print(f"  Step {step_number} ({label}): '{current}'")

    final = current.strip()
    print(f"  Final: '{final}'")
    return final


def debug_preprocessing(pdf_path: str = "test_resume.pdf") -> None:
    """Debug what preprocessing does to location text.

    Reads the PDF at ``pdf_path``, extracts its text, and prints:

    * the first 30 lines of the extracted text,
    * the lines among those 30 that contain a location keyword,
    * a step-by-step trace of the preprocessing regexes on each such line,
    * the first 30 lines of ``parser._preprocess_for_spacy(text)``, and
    * which location regexes match the preprocessed text.

    Args:
        pdf_path: Path of the PDF to inspect. Defaults to ``"test_resume.pdf"``
            (the previously hard-coded file).

    Returns:
        None. All results are printed. If the file cannot be read, or no text
        can be extracted, an ``ERROR:`` line is printed and the function
        returns early instead of raising.
    """
    # Read the file first so a bad path is reported before the parser and
    # extractor are constructed.
    try:
        with open(pdf_path, "rb") as pdf_file:
            pdf_content = pdf_file.read()
    except OSError as err:
        print(f"ERROR: Could not read PDF '{pdf_path}': {err}")
        return

    # Initialize parser
    parser = IntelligentResumeParser()

    # Extract text from PDF
    extractor = PDFExtractor()
    text = extractor.extract_text(pdf_content)

    if not text:
        print("ERROR: Could not extract text from PDF")
        return

    print(f"=== ORIGINAL TEXT (first {_PREVIEW_LINE_COUNT} lines) ===")
    lines = text.split('\n')[:_PREVIEW_LINE_COUNT]
    _print_numbered(lines)

    # Find location-related lines (only within the previewed lines above)
    print("\n=== LOCATION-RELATED LINES ===")
    location_lines = []
    for line_num, line in enumerate(lines, 1):
        lowered = line.lower()
        if any(keyword in lowered for keyword in _LOCATION_KEYWORDS):
            location_lines.append((line_num, line))
            print(f"{line_num:2d}: {line}")

    # Test preprocessing on specific lines
    print("\n=== PREPROCESSING TEST ===")
    for line_num, line in location_lines:
        print(f"\nLine {line_num}: '{line}'")
        _trace_preprocessing_steps(line)

    # Test full preprocessing
    print("\n=== FULL PREPROCESSING ===")
    cleaned_text = parser._preprocess_for_spacy(text)

    print(f"Original text length: {len(text)}")
    print(f"Cleaned text length: {len(cleaned_text)}")

    # Show the start of the cleaned text
    cleaned_lines = cleaned_text.split('\n')[:_PREVIEW_LINE_COUNT]
    print(f"\nCleaned text (first {_PREVIEW_LINE_COUNT} lines):")
    _print_numbered(cleaned_lines)

    # Search for location patterns in cleaned text
    print("\n=== LOCATION SEARCH IN CLEANED TEXT ===")
    for pattern in _LOCATION_PATTERNS:
        matches = re.findall(pattern, cleaned_text, re.IGNORECASE)
        if matches:
            print(f"Pattern '{pattern}' found: {matches}")

# Run the preprocessing trace only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    debug_preprocessing()