Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
ef24085
feat: integrate docling for document data extraction
karm1000 Mar 2, 2026
e521e36
chore: ensure developer mode is respected in transaction parsing queue
Abdeali099 Mar 12, 2026
093955f
refactor: streamline file processing methods and enhance readability
Abdeali099 Mar 12, 2026
cd16721
refactor: enhance type annotations and improve method signatures in F…
Abdeali099 Mar 13, 2026
fa12e4d
refactor: streamline PDF processing methods and enhance file handling
Abdeali099 Mar 13, 2026
9628d83
fix: add page limit check for PDF processing in FileProcessor
Abdeali099 Mar 13, 2026
ae1ea00
feat: add PDF processor selection and integration for enhanced docume…
Abdeali099 Mar 13, 2026
5910f00
fix: update default PDF processor to Docling and refine description
Abdeali099 Mar 14, 2026
002b433
fix: enhance DoclingPDFProcessor with converter setup and add PDF pip…
Abdeali099 Mar 15, 2026
3d43a25
feat: implement PDF processor selection and default setting for enhan…
Abdeali099 Mar 16, 2026
7002d84
Merge branch 'version-15' into use-docling-to-extract-data
Abdeali099 Mar 16, 2026
04fedc3
fix: refactor DoclingPDFProcessor to import necessary modules locally
Abdeali099 Mar 17, 2026
c0ebb89
revert: remove unnecessary request cache decorator from get_pdf_proce…
Abdeali099 Mar 17, 2026
51b54c5
chore: add comment to clarify processor resolution order
Abdeali099 Mar 17, 2026
d15a87b
fix: optimize DoclingPDFProcessor to use a singleton converter instance
Abdeali099 Mar 17, 2026
1e9825d
fix: reset file pointer before returning in OCRMyPDFProcessor
Abdeali099 Mar 17, 2026
5b5d7f1
fix: enhance error handling for unsupported spreadsheet file types
Abdeali099 Mar 17, 2026
e9004cd
chore: fix typo
Abdeali099 Mar 17, 2026
505f7f1
fix: improve formatting and clarity in PDFProcessor documentation
Abdeali099 Mar 17, 2026
fef2bb1
fix: add check for existing PDF processor setting before setting default
Abdeali099 Mar 17, 2026
fd8d700
fix: enhance error handling in DoclingPDFProcessor for conversion status
Abdeali099 Mar 17, 2026
426332e
fix: improve CSV content decoding by refining fallback encodings
Abdeali099 Mar 17, 2026
a47f59a
chore: minor fix
Abdeali099 Mar 17, 2026
b5690d1
fix: reset file pointer in trim_pages method for proper PDF processing
Abdeali099 Mar 17, 2026
984bf6f
chore: minor change
Abdeali099 Mar 17, 2026
22b35bc
fix: improve formatting and readability in PDFProcessor documentation
Abdeali099 Mar 17, 2026
9f02aaf
chore: minor change
Abdeali099 Mar 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies = [
"rapidfuzz~=3.12.2",
"pymupdf~=1.26.3",
"openai",
"docling>=2.75.0",
]

[build-system]
Expand Down
5 changes: 5 additions & 0 deletions transaction_parser/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,8 @@
"on_update": "transaction_parser.transaction_parser.overrides.communication.on_update",
}
}

# Registry of available PDF text-extraction backends.
# Keys are the labels offered in the "PDF Processor" Select field of
# Transaction Parser Settings; values are dotted import paths to the
# corresponding processor class resolved at runtime.
pdf_processors = {
    "OCRMyPDF": "transaction_parser.transaction_parser.utils.pdf_processor.OCRMyPDFProcessor",
    "Docling": "transaction_parser.transaction_parser.utils.pdf_processor.DoclingPDFProcessor",
}
3 changes: 2 additions & 1 deletion transaction_parser/patches.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@

[post_model_sync]
# Patches added in this section will be executed after doctypes are migrated
execute:from transaction_parser.install import after_install; after_install() #2
execute:from transaction_parser.install import after_install; after_install() #2
transaction_parser.patches.set_default_pdf_processor #1
Empty file.
13 changes: 13 additions & 0 deletions transaction_parser/patches/set_default_pdf_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import frappe

from transaction_parser.transaction_parser.utils.pdf_processor import (
DEFAULT_PDF_PROCESSOR,
)


def execute():
    """Patch: seed the default PDF processor on Transaction Parser Settings.

    Writes ``DEFAULT_PDF_PROCESSOR`` into the ``pdf_processor`` field only
    when no value is present, so an administrator's explicit choice is
    never overwritten by a migration.
    """
    doctype = "Transaction Parser Settings"
    fieldname = "pdf_processor"

    # frappe.db.get_single_value returns a falsy value when the Single
    # doctype field has never been set — only then do we apply the default.
    current = frappe.db.get_single_value(doctype, fieldname)
    if not current:
        frappe.db.set_single_value(doctype, fieldname, DEFAULT_PDF_PROCESSOR)
1 change: 1 addition & 0 deletions transaction_parser/transaction_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def parse(transaction, country, file_url, ai_model=None, page_limit=None):
ai_model=cstr(ai_model),
page_limit=cint(page_limit),
queue="long",
now=frappe.conf.developer_mode,
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def get_api_key(self) -> str:
_("API Key not found for model {0}").format(self.model.service_provider)
)

def get_content(self, response: dict) -> dict | str:
def get_content(self, response: dict) -> dict:
"""Extract content from API response."""
content = response["choices"][0]["message"]["content"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"enabled",
"ai_model_section",
"default_ai_model",
"pdf_processor",
"api_keys",
"transaction_configurations_section",
"invoice_lookback_count",
Expand Down Expand Up @@ -92,6 +93,15 @@
"mandatory_depends_on": "eval: doc.enabled",
"options": "DeepSeek Chat\nDeepSeek Reasoner\nOpenAI gpt-4o\nOpenAI gpt-4o-mini\nOpenAI gpt-5\nOpenAI gpt-5-mini\nGoogle Gemini Pro\nGoogle Gemini Flash"
},
{
"default": "OCRMyPDF",
"depends_on": "eval: doc.enabled",
"description": "Select the library to use for PDF text extraction",
"fieldname": "pdf_processor",
"fieldtype": "Select",
"label": "PDF Processor",
"options": "OCRMyPDF\nDocling"
},
{
"depends_on": "eval: doc.enabled",
"fieldname": "api_keys",
Expand Down Expand Up @@ -156,7 +166,7 @@
"index_web_pages_for_search": 1,
"issingle": 1,
"links": [],
"modified": "2025-09-08 08:48:58.870032",
"modified": "2026-03-14 13:35:17.150533",
"modified_by": "Administrator",
"module": "Transaction Parser",
"name": "Transaction Parser Settings",
Expand All @@ -177,4 +187,4 @@
"sort_field": "modified",
"sort_order": "DESC",
"states": []
}
}
151 changes: 63 additions & 88 deletions transaction_parser/transaction_parser/utils/file_processor.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,104 @@
import io

import frappe
import ocrmypdf
import pymupdf
from frappe import _
from frappe.core.doctype.file.file import File
from frappe.utils.csvutils import read_csv_content
from frappe.utils.xlsxutils import (
read_xls_file_from_attached_file,
read_xlsx_file_from_attached_file,
)

from transaction_parser.transaction_parser.utils.pdf_processor import (
PDFProcessor,
get_pdf_processor,
)

class FileProcessor:
"""Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content."""

def get_content(self, doc, page_limit=None):
class FileProcessor:
"""
Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content.
"""

def get_content(
self,
doc: File,
page_limit: int | None = None,
pdf_processor: PDFProcessor | None = None,
) -> str | None:
if doc.file_type == "PDF":
return self._process_pdf(doc, page_limit)
elif doc.file_type in ["CSV", "XLSX", "XLS"]:
return self._process_spreadsheet(doc)
else:
frappe.throw(_("Only PDF, CSV, and Excel files are supported"))
return self.process_pdf(doc, page_limit, pdf_processor)

def _process_pdf(self, doc, page_limit=None):
"""Process PDF files with OCR and page limiting."""
self.file = io.BytesIO(doc.get_content())
self._remove_extra_pages(page_limit)
self._apply_ocr()
return self._get_text()
if doc.file_type in ("CSV", "XLSX", "XLS"):
return self.process_spreadsheet(doc)

def _process_spreadsheet(self, doc):
"""Process CSV and Excel files."""
frappe.throw(
title=_("Unsupported File Type"),
msg=_("Only PDF, CSV, and Excel files are supported"),
)

def process_pdf(
self,
doc: File,
page_limit: int | None = None,
pdf_processor: PDFProcessor | None = None,
) -> str:
"""
Process PDF files using the configured PDF processor strategy.
"""
pdf_processor = pdf_processor or get_pdf_processor()
return pdf_processor.process(doc, page_limit)

def process_spreadsheet(self, doc: File) -> str:
"""
Process CSV and Excel files.
"""
file_content = doc.get_content()

if doc.file_type == "CSV":
file_content_str = self._decode_csv_content(file_content)
file_content_str = self.decode_csv_content(file_content)
rows = read_csv_content(file_content_str)
elif doc.file_type == "XLSX":
rows = read_xlsx_file_from_attached_file(fcontent=file_content)
elif doc.file_type == "XLS":
rows = read_xls_file_from_attached_file(file_content)
else:
frappe.throw(
title=_("Unsupported File Type"),
msg=_(
"Cannot process spreadsheet with file type: {0}. <br> Supported types are CSV, XLSX, and XLS."
).format(doc.file_type),
)

# Convert rows to a formatted string representation
return self._format_rows_as_text(rows)
return self.format_rows_as_text(rows)

def _decode_csv_content(self, content):
"""Decode CSV file content with fallback encodings."""
def decode_csv_content(self, content: str | bytes) -> str:
"""
Decode CSV file content with fallback encodings.
"""
# If content is already a string, return as-is
if isinstance(content, str):
return content

# If content is bytes, decode it
encodings = ["utf-8", "utf-8-sig", "latin1", "cp1252"]

for encoding in encodings:
# ! Note: Always keep `latin1` as the last fallback encoding, as it can decode any byte sequence without errors (Garbage)
for encoding in ("utf-8", "utf-8-sig", "cp1252", "latin1"):
try:
return content.decode(encoding)
except UnicodeDecodeError:
continue

# If all encodings fail, try with error handling
try:
return content.decode("utf-8", errors="replace")
except Exception:
frappe.throw(
_(
"Unable to decode CSV file. Please ensure the file is saved with a supported encoding."
)
frappe.throw(
_(
"Unable to decode CSV file. Please ensure the file is saved with a supported encoding."
)
)

def _format_rows_as_text(self, rows):
"""Convert rows to a text format suitable for AI processing."""
def format_rows_as_text(self, rows: list) -> str:
"""
Convert rows to a text format suitable for AI processing.
"""
if not rows:
frappe.throw(_("No data found in the file."))

Expand Down Expand Up @@ -106,56 +134,3 @@ def _format_rows_as_text(self, rows):
text_parts.append(f"Total columns: {len(rows[0])}")

return "\n".join(text_parts)

def _remove_extra_pages(self, page_limit=None):
if not page_limit:
return

input_pdf = pymupdf.open(stream=self.file, filetype="pdf")
output_pdf = pymupdf.open()
output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1)

temp_file = io.BytesIO()
output_pdf.save(temp_file)

output_pdf.close()
input_pdf.close()

self.file = temp_file
self.file.seek(0)

def _apply_ocr(self):
doc = pymupdf.open(stream=self.file, filetype="pdf")
pages_to_ocr = [
str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip()
]

if not pages_to_ocr:
return

pages = ",".join(pages_to_ocr)

temp_file = io.BytesIO()
self.file.seek(0)

ocrmypdf.ocr(
input_file=self.file,
output_file=temp_file,
pages=pages,
progress_bar=False,
rotate_pages=True,
force_ocr=True,
)

self.file = temp_file
self.file.seek(0)

def _get_text(self):
text = ""
doc = pymupdf.open(stream=self.file, filetype="pdf")
for page in doc:
text += page.get_text("text")

doc.close()

return text
Loading
Loading