-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlocal_ocr_processor.py
More file actions
64 lines (49 loc) · 2.21 KB
/
Copy pathlocal_ocr_processor.py
File metadata and controls
64 lines (49 loc) · 2.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import re

import pytesseract
from PIL import Image
class LocalOCRProcessor:
    """CPU-based OCR using Tesseract - No API limits!

    Text is extracted locally through ``pytesseract``. The public methods
    accept the same parameters as ``OCRProcessor`` so this class can be
    used as a drop-in replacement for it.
    """

    @staticmethod
    def _natural_sort_key(filename):
        """Return a sort key that orders embedded numbers numerically.

        With this key ``page_2.png`` sorts before ``page_10.png``, which a
        plain string sort gets wrong. The raw filename is appended as a
        tie-breaker so names that differ only in zero padding
        (``page_01.png`` / ``page_1.png``) still sort deterministically.
        """
        parts = re.split(r'([0-9]+)', filename)
        # Splitting on a capturing group alternates text, digits, text, ...
        # so the odd indices are always the digit runs; keys therefore
        # never compare a str against an int at the same position.
        numeric_aware = [int(part) if i % 2 else part
                         for i, part in enumerate(parts)]
        return numeric_aware, filename

    def extract_text_from_image(self, image_path):
        """Extract text using local Tesseract OCR.

        Args:
            image_path: Path of the image file to run OCR on.

        Returns:
            The recognized text with surrounding whitespace stripped, or
            ``None`` when no text was recognized or when opening/processing
            the image failed (the error is printed, not raised).
        """
        # Use config for better accuracy with textbooks
        custom_config = r'--oem 3 --psm 6'
        try:
            # The context manager closes the image's file handle as soon as
            # OCR is done instead of leaving it to garbage collection, which
            # matters when a whole folder of pages is processed in a loop.
            with Image.open(image_path) as img:
                text = pytesseract.image_to_string(
                    img, lang='eng', config=custom_config)
        except Exception as e:
            # Deliberately best-effort: one unreadable page must not abort
            # the whole book, so report the failure and signal "no text".
            print(f"Error processing {image_path}: {e}")
            return None
        # Whitespace-only output (e.g. a blank page) counts as "no text".
        cleaned = text.strip() if text else ""
        return cleaned or None

    def process_book_folder(self, folder_path, delay=0, resume=True):
        """Process all images - NO RATE LIMITS!

        Parameters match OCRProcessor for drop-in replacement.

        Args:
            folder_path: Directory whose image files are processed.
            delay: Accepted for signature compatibility only; not used.
            resume: Accepted for signature compatibility only; not used.

        Returns:
            A list of dicts with the keys ``'filename'``, ``'text'`` and
            ``'page_number'``, one per image that yielded text.
            ``page_number`` is the 1-based position of the image in the
            naturally sorted file list, so images without text leave gaps.

        Raises:
            OSError: If ``folder_path`` cannot be listed.
        """
        image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')
        # Natural ordering keeps unpadded page numbers in reading order
        # (page_2 before page_10), so page_number matches the real page.
        image_files = sorted(
            (name for name in os.listdir(folder_path)
             if name.lower().endswith(image_extensions)),
            key=self._natural_sort_key,
        )
        total = len(image_files)

        print(f"🚀 Processing {total} images with local OCR...")
        print("⚡ No rate limits! Processing at full speed!\n")

        extracted_texts = []
        for idx, filename in enumerate(image_files, 1):
            image_path = os.path.join(folder_path, filename)
            print(f"[{idx}/{total}] {filename}...", end=" ", flush=True)
            text = self.extract_text_from_image(image_path)
            if text:
                extracted_texts.append({
                    'filename': filename,
                    'text': text,
                    'page_number': idx
                })
                print("✅")
            else:
                print("⚠️")

        print(f"\n✅ Processed {len(extracted_texts)} images!")
        return extracted_texts
# Alias so this class can be used as a drop-in replacement under the
# name `OCRProcessor`.
OCRProcessor = LocalOCRProcessor


if __name__ == "__main__":
    # Manual smoke run: OCR every image in ./books and preview the first
    # page that produced text.
    pages = LocalOCRProcessor().process_book_folder("./books")
    if pages:
        first_page_text = pages[0]['text']
        print("\n--- Sample Output ---")
        print(f"First page: {first_page_text[:300]}...")