-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextraction_ocr.py
More file actions
53 lines (38 loc) · 1.76 KB
/
extraction_ocr.py
File metadata and controls
53 lines (38 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""
Author: Ali Vijdaan
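Description: Helper functions built on the unstructured (unstructured.io) library: PDF partitioning, filtering of elements by type, and JSON export.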
"""
import unstructured
import unstructured.documents
import unstructured.documents.elements
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json
#Function for OCR Extraction from PDFs
def ocr_extraction(file_name, output_dir):
    """Partition a PDF with unstructured's ``partition_pdf`` and return the result.

    The call uses the ``hi_res`` strategy with table-structure inference and
    image extraction switched on.

    Args:
        file_name: Passed to ``partition_pdf`` as ``filename``.
        output_dir: Passed to ``partition_pdf`` as
            ``extract_image_block_output_dir`` (presumably the directory where
            extracted image blocks are written -- confirm against the
            installed unstructured version).

    Returns:
        Whatever ``partition_pdf`` returns for this file, unchanged.
    """
    partition_options = {
        "filename": file_name,
        "strategy": "hi_res",
        "infer_table_structure": True,
        "extract_images_in_pdf": True,
        "extract_image_block_output_dir": output_dir,
    }
    return partition_pdf(**partition_options)
# Element classes grouped by content category. The filter_* helpers in this
# module test ``type(element)`` against these lists, so only the exact classes
# listed here match.
text_element_items = [
    unstructured.documents.elements.Text,
    unstructured.documents.elements.NarrativeText,
    unstructured.documents.elements.ListItem,
    unstructured.documents.elements.Header,
    unstructured.documents.elements.Footer,
    unstructured.documents.elements.Title,
    unstructured.documents.elements.CompositeElement,
]
image_element_items = [
    unstructured.documents.elements.Image,
]
table_element_items = [
    unstructured.documents.elements.Table,
    unstructured.documents.elements.TableChunk,
]
def filter_text_elements(raw_elements: list) -> list:
    """Return the elements whose exact class is listed in ``text_element_items``.

    Input order is preserved. Matching uses ``type(element)``, not
    ``isinstance``, so subclasses that are not listed themselves are excluded.
    NOTE(review): the exact-type match looks deliberate -- other unstructured
    element classes appear to derive from ``Text``; confirm before switching
    to ``isinstance``.
    """
    selected = []
    for element in raw_elements:
        if type(element) in text_element_items:
            selected.append(element)
    return selected
def filter_image_elements(raw_elements: list) -> list:
    """Return the elements whose exact class is listed in ``image_element_items``.

    Input order is preserved. Matching uses ``type(element)``, not
    ``isinstance``, so subclasses that are not listed themselves are excluded.
    """
    selected = []
    for element in raw_elements:
        if type(element) in image_element_items:
            selected.append(element)
    return selected
def filter_table_elements(raw_elements: list) -> list:
    """Return the elements whose exact class is listed in ``table_element_items``.

    Input order is preserved. Matching uses ``type(element)``, not
    ``isinstance``, so subclasses that are not listed themselves are excluded.
    """
    selected = []
    for element in raw_elements:
        if type(element) in table_element_items:
            selected.append(element)
    return selected
def convert_to_json(elements, filename):
    """Serialize elements to JSON via unstructured's ``elements_to_json``.

    Args:
        elements: Passed through as the first positional argument of
            ``elements_to_json``.
        filename: Passed through as the ``filename`` keyword (presumably the
            output path the JSON is written to -- confirm against the
            installed unstructured version).

    Returns:
        None. Any value returned by ``elements_to_json`` is discarded.
    """
    elements_to_json(elements, filename=filename)