-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathPDFMinerLibrary.py
More file actions
51 lines (46 loc) · 2.57 KB
/
PDFMinerLibrary.py
File metadata and controls
51 lines (46 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
from typing import List
def find_row(pdf_path: str, search_text: str) -> List[str]:
    """
    Find all elements from the same row by matching the coordinates.

    Pages are scanned in the order ``extract_pages`` yields them. The first
    horizontal text box whose text contains ``search_text`` is used as the
    anchor. The row consists of the anchor plus every other horizontal text
    box on the same page whose bounding box has exactly the same ``y0`` and
    ``y1`` as the anchor and a larger ``x0``.

    Args:
        pdf_path: Path of the PDF file, handed to ``extract_pages``.
        search_text: Substring to look for in the text of each text box.

    Returns:
        The stripped texts of the row, in the order the page layout yields
        the boxes, or an empty list when no text box contains
        ``search_text``.
    """
    for page_layout in extract_pages(pdf_path):
        text_boxes = [
            element
            for element in page_layout
            if isinstance(element, LTTextBoxHorizontal)
        ]
        for anchor in text_boxes:
            if search_text not in anchor.get_text():
                continue
            x0, y0, _, y1 = anchor.bbox
            row_elements = []
            for element in text_boxes:
                # The element we are using to search always belongs to the row.
                if element == anchor:
                    row_elements.append(element.get_text().strip())
                    continue
                ex0, ey0, _, ey1 = element.bbox
                # Same row: identical y0/y1 and located after the search
                # element along the x-axis. The comparison is exact, so boxes
                # that are only approximately aligned are not picked up.
                if ey0 == y0 and ey1 == y1 and ex0 > x0:
                    row_elements.append(element.get_text().strip())
            # The anchor itself is always in the list, so the first match
            # settles the result; later matches and pages are not inspected.
            return row_elements
    # No text box on any page contained the search text.
    return []
def find_column(pdf_path: str, search_text: str) -> List[str]:
    """
    Find all elements from the same column by matching the coordinates.

    Pages are scanned in the order ``extract_pages`` yields them. The first
    horizontal text box whose text contains ``search_text`` is used as the
    anchor (typically a column header). The column consists of the anchor
    plus every other horizontal text box on the same page that starts at or
    after the anchor's ``x0``, ends no further than one unit past the
    anchor's ``x1``, and has a smaller ``y0`` than the anchor.

    Args:
        pdf_path: Path of the PDF file, handed to ``extract_pages``.
        search_text: Substring to look for in the text of each text box.

    Returns:
        The stripped texts of the column, in the order the page layout
        yields the boxes, or an empty list when no text box contains
        ``search_text``.
    """
    for page_layout in extract_pages(pdf_path):
        text_boxes = [
            element
            for element in page_layout
            if isinstance(element, LTTextBoxHorizontal)
        ]
        for anchor in text_boxes:
            if search_text not in anchor.get_text():
                continue
            x0, y0, x1, _ = anchor.bbox
            columns_elements = []
            for element in text_boxes:
                # The element we are using to search always belongs to the
                # column.
                if element == anchor:
                    columns_elements.append(element.get_text().strip())
                    continue
                ex0, ey0, ex1, _ = element.bbox
                # Same column: horizontally inside the anchor's extent (give
                # or take: the right edge may overshoot by 1), and a smaller
                # y0 than the anchor.
                # NOTE(review): presumably "smaller y0" means further down the
                # page (PDF y-axis growing upwards) - confirm.
                if ex0 >= x0 and ex1 <= (x1 + 1) and y0 > ey0:
                    columns_elements.append(element.get_text().strip())
            # The anchor itself is always in the list, so the first match
            # settles the result; later matches and pages are not inspected.
            return columns_elements
    # No text box on any page contained the search text.
    return []