-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathocr.py
More file actions
32 lines (28 loc) · 952 Bytes
/
ocr.py
File metadata and controls
32 lines (28 loc) · 952 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
import pytesseract
from pdf2image import convert_from_path
from docx import Document
def extract_text_from_file(filepath):
    """Return the text content of ``filepath``, chosen by its file extension.

    The extension is compared case-insensitively:

    * ``.pdf`` is handed to ``extract_text_from_pdf``
    * ``.docx`` is handed to ``extract_text_from_docx``
    * ``.png`` / ``.jpg`` / ``.jpeg`` are handed to ``extract_text_from_image``
    * ``.txt`` is read directly as UTF-8 text

    Any other extension (including no extension at all) yields an empty
    string.
    """
    _, raw_suffix = os.path.splitext(filepath)
    suffix = raw_suffix.lower()

    if suffix == ".pdf":
        return extract_text_from_pdf(filepath)
    if suffix == ".docx":
        return extract_text_from_docx(filepath)
    if suffix in (".png", ".jpg", ".jpeg"):
        return extract_text_from_image(filepath)
    if suffix == ".txt":
        with open(filepath, "r", encoding="utf-8") as handle:
            contents = handle.read()
        return contents

    # Unrecognised file type: nothing to extract.
    return ""
def extract_text_from_pdf(filepath):
    """OCR a PDF file and return the recognised text as a single string.

    The PDF is turned into images with ``convert_from_path`` (presumably one
    image per page -- confirm against pdf2image), each image is passed to
    ``pytesseract.image_to_string``, and the per-image results are
    concatenated in order with no separator added by this function.
    """
    page_images = convert_from_path(filepath)
    return "".join(pytesseract.image_to_string(page) for page in page_images)
def extract_text_from_docx(filepath):
    """Return the text of a .docx file, one paragraph per line.

    Opens the file with ``Document`` and joins the ``text`` of every entry in
    ``paragraphs`` with a newline. Only ``paragraphs`` is consulted; nothing
    else in the document object is read here.
    """
    document = Document(filepath)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def extract_text_from_image(filepath):
    """Run OCR on the image at ``filepath`` and return the recognised text.

    The path is passed straight to ``pytesseract.image_to_string``; the image
    is not opened or pre-processed by this function.
    """
    recognised_text = pytesseract.image_to_string(filepath)
    return recognised_text