3Ts_StudyAssistant/local_ocr_processor.py at master · 3TInfoTinker/3Ts_StudyAssistant · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import pytesseract
from PIL import Image

class LocalOCRProcessor:
    """CPU-based OCR using Tesseract - No API limits!

    Text is extracted locally through ``pytesseract``, so no remote service
    (and therefore no request quota) is involved. The public methods mirror
    the ``OCRProcessor`` interface so this class can be swapped in for it.
    """

    # Lower-case file extensions that are treated as page images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')

    # Tesseract command-line flags (OCR engine mode 3, page segmentation
    # mode 6), chosen for better accuracy with textbooks.
    TESSERACT_CONFIG = r'--oem 3 --psm 6'

    def extract_text_from_image(self, image_path):
        """Extract text from a single image using local Tesseract OCR.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The recognised English text with surrounding whitespace stripped
            (this can be an empty string if only whitespace was recognised),
            or ``None`` when Tesseract returned nothing or any error occurred.
            Errors are printed rather than raised.
        """
        try:
            # The context manager releases the image's file handle even when
            # OCR raises, instead of leaving it open until garbage collection.
            with Image.open(image_path) as img:
                text = pytesseract.image_to_string(
                    img, lang='eng', config=self.TESSERACT_CONFIG
                )
        except Exception as e:
            # Deliberately broad: one unreadable page must not abort a batch.
            print(f"Error processing {image_path}: {e}")
            return None

        return text.strip() if text else None

    def process_book_folder(self, folder_path, delay=0, resume=True):
        """Process all images in a folder - NO RATE LIMITS!

        Args:
            folder_path: Directory whose image files are OCR'd. Only direct
                entries are considered; matching is by file extension,
                case-insensitively.
            delay: Accepted so the signature matches OCRProcessor; ignored.
            resume: Accepted so the signature matches OCRProcessor; ignored.

        Returns:
            A list of dicts, one per image that yielded text, with keys
            ``'filename'``, ``'text'`` and ``'page_number'``. ``page_number``
            is the 1-based position of the file in the sorted file list, so
            pages that failed or were empty leave gaps in the numbering.
        """
        # NOTE(review): plain lexicographic order, so "page10.jpg" sorts
        # before "page2.jpg" unless the file names are zero-padded.
        image_files = sorted(
            name for name in os.listdir(folder_path)
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
        )
        total = len(image_files)

        print(f"🚀 Processing {total} images with local OCR...")
        print("⚡ No rate limits! Processing at full speed!\n")

        extracted_texts = []
        for idx, filename in enumerate(image_files, 1):
            image_path = os.path.join(folder_path, filename)
            # Keep the cursor on this line so the status mark lands next to it.
            print(f"[{idx}/{total}] {filename}...", end=" ", flush=True)

            text = self.extract_text_from_image(image_path)
            if not text:
                print("⚠️")
                continue

            extracted_texts.append({
                'filename': filename,
                'text': text,
                'page_number': idx
            })
            print("✅")

        print(f"\n✅ Processed {len(extracted_texts)} images!")
        return extracted_texts

# Alias for drop-in replacement: code that refers to the name `OCRProcessor`
# gets the local Tesseract-based implementation when it imports this module.
OCRProcessor = LocalOCRProcessor

if __name__ == "__main__":
    # Manual smoke test: OCR every image under ./books and preview the start
    # of the first page that produced text.
    ocr = LocalOCRProcessor()
    pages = ocr.process_book_folder("./books")

    if pages:
        first_page_text = pages[0]['text']
        print("\n--- Sample Output ---")
        print(f"First page: {first_page_text[:300]}...")