-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparsing.py
More file actions
97 lines (79 loc) · 3.11 KB
/
parsing.py
File metadata and controls
97 lines (79 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import fitz # PyMuPDF
import pdfplumber
import re
import json
import os
PDF_PATH = "solar.pdf" # path to your PDF
OUTPUT_JSON = "result.json" # JSON output
IMAGE_DIR = "Pics extraction" # directory to save images
# Create image directory if not exists
os.makedirs(IMAGE_DIR, exist_ok=True)
def extract_options(text):
"""
Extract options from text using regex:
Looks for a., b., c., d. patterns
"""
options = {}
matches = re.findall(r"([a-dA-D])\.\s*(.*?)\s*(?=[a-dA-D]\.|$)", text, re.DOTALL)
for letter, option_text in matches:
options[letter.lower()] = option_text.strip()
return options if options else None
def extract_questions_from_text(text):
"""
Split text into questions by numbering (Q1, 1., etc.)
"""
pattern = r"(?:Q\d+|\d+)\.?\s*(.*?)\n(?=(?:Q\d+|\d+)\.?|$)"
questions = re.findall(pattern, text, re.DOTALL)
return [q.strip() for q in questions]
# ---------- MAIN PARSER ----------
def parse_pdf(pdf_path):
data = []
doc = fitz.open(pdf_path)
with pdfplumber.open(pdf_path) as pdf:
for page_num, page in enumerate(doc):
pdf_page = pdf.pages[page_num]
page_text = page.get_text("text")
questions_text = extract_questions_from_text(page_text)
# images extraction
page_images = []
for img_index, img in enumerate(page.get_images(full=True)):
xref = img[0]
pix = fitz.Pixmap(doc, xref)
# Convert to RGB if needed
if pix.n > 4 or pix.n == 4:
pix = fitz.Pixmap(fitz.csRGB, pix)
img_filename = f"{IMAGE_DIR}/page{page_num+1}_img{img_index}.png"
pix.save(img_filename)
pix = None
page_images.append(img_filename)
# table creation
tables = pdf_page.extract_tables()
tables = tables if tables else None
# combine question one after other
if questions_text:
for idx, q_text in enumerate(questions_text, start=1):
question_data = {
"page": page_num + 1,
"question_no": idx,
"text": q_text,
"options": extract_options(q_text),
"table": tables,
"images": page_images if page_images else None
}
data.append(question_data)
else:
data.append({
"page": page_num + 1,
"note": "No clear questions extracted. PDF may not follow Q1./1. pattern."
})
return data
# json file creation
def save_json(data, output_path):
with open(output_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=4, ensure_ascii=False)
# main fun()
if __name__ == "__main__":
parsed_data = parse_pdf(PDF_PATH)
save_json(parsed_data, OUTPUT_JSON)
print(f"✅ Parsing complete! JSON saved to {OUTPUT_JSON}")
print(f"📂 Images saved in folder: {IMAGE_DIR}")