From ef240855c4badd26098ae4e4cd910e0287a39d37 Mon Sep 17 00:00:00 2001
From: Karm Soni <karmdsoni8159@gmail.com>
Date: Mon, 2 Mar 2026 17:43:59 +0530
Subject: [PATCH 01/26] feat: integrate docling for document data extraction

---
 pyproject.toml                                  |  1 +
 .../transaction_parser/utils/file_processor.py  | 17 +++++++++++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7d7d91b..a458259 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
     "rapidfuzz~=3.12.2",
     "pymupdf~=1.26.3",
     "openai",
+    "docling>=2.75.0",
 ]
 
 [build-system]
diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py
index 8a3ccd9..e95b599 100644
--- a/transaction_parser/transaction_parser/utils/file_processor.py
+++ b/transaction_parser/transaction_parser/utils/file_processor.py
@@ -3,6 +3,8 @@
 import frappe
 import ocrmypdf
 import pymupdf
+from docling.datamodel.base_models import DocumentStream
+from docling.document_converter import DocumentConverter
 from frappe import _
 from frappe.utils.csvutils import read_csv_content
 from frappe.utils.xlsxutils import (
@@ -14,6 +16,14 @@
 class FileProcessor:
     """Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content."""
 
+    def get_doc_stream(self, doc):
+        """Get a DocumentStream for the given file doc."""
+        content = self.get_content(doc)
+        return DocumentStream(
+            name=doc.file_name,
+            content=io.BytesIO(content),
+        )
+
     def get_content(self, doc, page_limit=None):
         if doc.file_type == "PDF":
             return self._process_pdf(doc, page_limit)
@@ -24,10 +34,9 @@ def get_content(self, doc, page_limit=None):
 
     def _process_pdf(self, doc, page_limit=None):
         """Process PDF files with OCR and page limiting."""
-        self.file = io.BytesIO(doc.get_content())
-        self._remove_extra_pages(page_limit)
-        self._apply_ocr()
-        return self._get_text()
+        self.converter = DocumentConverter()
+        result = self.converter.convert(self.get_doc_stream(doc))
+        return result.document.export_to_markdown()
 
     def _process_spreadsheet(self, doc):
         """Process CSV and Excel files."""

From e521e36bdce3e416de6783010d70df41c9c80b60 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Thu, 12 Mar 2026 14:54:59 +0530
Subject: [PATCH 02/26] chore: run transaction parsing job immediately in
 developer mode instead of queueing it

---
 transaction_parser/transaction_parser/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transaction_parser/transaction_parser/__init__.py b/transaction_parser/transaction_parser/__init__.py
index 83555ce..abd27da 100644
--- a/transaction_parser/transaction_parser/__init__.py
+++ b/transaction_parser/transaction_parser/__init__.py
@@ -28,6 +28,7 @@ def parse(transaction, country, file_url, ai_model=None, page_limit=None):
         ai_model=cstr(ai_model),
         page_limit=cint(page_limit),
         queue="long",
+        now=frappe.conf.developer_mode,
     )
 
 

From 093955f8d34050e5351231349cd40ab493f4afae Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Thu, 12 Mar 2026 17:50:43 +0530
Subject: [PATCH 03/26] refactor: revert Docling extraction in FileProcessor
 and make helper methods public

---
 .../utils/file_processor.py                   | 57 +++++++++----------
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py
index e95b599..1ac90e9 100644
--- a/transaction_parser/transaction_parser/utils/file_processor.py
+++ b/transaction_parser/transaction_parser/utils/file_processor.py
@@ -3,8 +3,6 @@
 import frappe
 import ocrmypdf
 import pymupdf
-from docling.datamodel.base_models import DocumentStream
-from docling.document_converter import DocumentConverter
 from frappe import _
 from frappe.utils.csvutils import read_csv_content
 from frappe.utils.xlsxutils import (
@@ -14,36 +12,33 @@
 
 
 class FileProcessor:
-    """Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content."""
-
-    def get_doc_stream(self, doc):
-        """Get a DocumentStream for the given file doc."""
-        content = self.get_content(doc)
-        return DocumentStream(
-            name=doc.file_name,
-            content=io.BytesIO(content),
-        )
+    """
+    Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content.
+    """
 
     def get_content(self, doc, page_limit=None):
         if doc.file_type == "PDF":
-            return self._process_pdf(doc, page_limit)
+            return self.process_pdf(doc, page_limit)
         elif doc.file_type in ["CSV", "XLSX", "XLS"]:
-            return self._process_spreadsheet(doc)
+            return self.process_spreadsheet(doc)
         else:
             frappe.throw(_("Only PDF, CSV, and Excel files are supported"))
 
-    def _process_pdf(self, doc, page_limit=None):
+    def process_pdf(self, doc, page_limit=None):
         """Process PDF files with OCR and page limiting."""
-        self.converter = DocumentConverter()
-        result = self.converter.convert(self.get_doc_stream(doc))
-        return result.document.export_to_markdown()
-
-    def _process_spreadsheet(self, doc):
-        """Process CSV and Excel files."""
+        self.file = io.BytesIO(doc.get_content())
+        self.remove_extra_pages(page_limit)
+        self.apply_ocr()
+        return self.get_text()
+
+    def process_spreadsheet(self, doc):
+        """
+        Process CSV and Excel files.
+        """
         file_content = doc.get_content()
 
         if doc.file_type == "CSV":
-            file_content_str = self._decode_csv_content(file_content)
+            file_content_str = self.decode_csv_content(file_content)
             rows = read_csv_content(file_content_str)
         elif doc.file_type == "XLSX":
             rows = read_xlsx_file_from_attached_file(fcontent=file_content)
@@ -51,10 +46,12 @@ def _process_spreadsheet(self, doc):
             rows = read_xls_file_from_attached_file(file_content)
 
         # Convert rows to a formatted string representation
-        return self._format_rows_as_text(rows)
+        return self.format_rows_as_text(rows)
 
-    def _decode_csv_content(self, content):
-        """Decode CSV file content with fallback encodings."""
+    def decode_csv_content(self, content):
+        """
+        Decode CSV file content with fallback encodings.
+        """
         # If content is already a string, return as-is
         if isinstance(content, str):
             return content
@@ -78,8 +75,10 @@ def _decode_csv_content(self, content):
                 )
             )
 
-    def _format_rows_as_text(self, rows):
-        """Convert rows to a text format suitable for AI processing."""
+    def format_rows_as_text(self, rows):
+        """
+        Convert rows to a text format suitable for AI processing.
+        """
         if not rows:
             frappe.throw(_("No data found in the file."))
 
@@ -116,7 +115,7 @@ def _format_rows_as_text(self, rows):
 
         return "\n".join(text_parts)
 
-    def _remove_extra_pages(self, page_limit=None):
+    def remove_extra_pages(self, page_limit=None):
         if not page_limit:
             return
 
@@ -133,7 +132,7 @@ def _remove_extra_pages(self, page_limit=None):
         self.file = temp_file
         self.file.seek(0)
 
-    def _apply_ocr(self):
+    def apply_ocr(self):
         doc = pymupdf.open(stream=self.file, filetype="pdf")
         pages_to_ocr = [
             str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip()
@@ -159,7 +158,7 @@ def _apply_ocr(self):
         self.file = temp_file
         self.file.seek(0)
 
-    def _get_text(self):
+    def get_text(self):
         text = ""
         doc = pymupdf.open(stream=self.file, filetype="pdf")
         for page in doc:

From cd16721bea292e6f18e11af5303e7b3c6875c546 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Fri, 13 Mar 2026 10:46:40 +0530
Subject: [PATCH 04/26] refactor: enhance type annotations and improve method
 signatures in FileProcessor

---
 .../utils/file_processor.py                   | 33 ++++++++++++-------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py
index 1ac90e9..b2a80e8 100644
--- a/transaction_parser/transaction_parser/utils/file_processor.py
+++ b/transaction_parser/transaction_parser/utils/file_processor.py
@@ -4,34 +4,45 @@
 import ocrmypdf
 import pymupdf
 from frappe import _
+from frappe.core.doctype.file.file import File
 from frappe.utils.csvutils import read_csv_content
 from frappe.utils.xlsxutils import (
     read_xls_file_from_attached_file,
     read_xlsx_file_from_attached_file,
 )
 
+# TODO: Make some method static
+# TODO: Remove self.file logic
+# TODO: Add DI like can use OCR or Docling for PDF processing
+
 
 class FileProcessor:
     """
     Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content.
     """
 
-    def get_content(self, doc, page_limit=None):
+    def get_content(self, doc: File, page_limit: int | None = None) -> str | None:
         if doc.file_type == "PDF":
             return self.process_pdf(doc, page_limit)
-        elif doc.file_type in ["CSV", "XLSX", "XLS"]:
+
+        if doc.file_type in ("CSV", "XLSX", "XLS"):
             return self.process_spreadsheet(doc)
-        else:
-            frappe.throw(_("Only PDF, CSV, and Excel files are supported"))
 
-    def process_pdf(self, doc, page_limit=None):
-        """Process PDF files with OCR and page limiting."""
+        frappe.throw(
+            title=_("Unsupported File Type"),
+            msg=_("Only PDF, CSV, and Excel files are supported"),
+        )
+
+    def process_pdf(self, doc: File, page_limit: int | None = None) -> str:
+        """
+        Process PDF files with OCR and page limiting.
+        """
         self.file = io.BytesIO(doc.get_content())
         self.remove_extra_pages(page_limit)
         self.apply_ocr()
         return self.get_text()
 
-    def process_spreadsheet(self, doc):
+    def process_spreadsheet(self, doc: File) -> str:
         """
         Process CSV and Excel files.
         """
@@ -48,7 +59,7 @@ def process_spreadsheet(self, doc):
         # Convert rows to a formatted string representation
         return self.format_rows_as_text(rows)
 
-    def decode_csv_content(self, content):
+    def decode_csv_content(self, content: str | bytes) -> str:
         """
         Decode CSV file content with fallback encodings.
         """
@@ -75,7 +86,7 @@ def decode_csv_content(self, content):
                 )
             )
 
-    def format_rows_as_text(self, rows):
+    def format_rows_as_text(self, rows: list) -> str:
         """
         Convert rows to a text format suitable for AI processing.
         """
@@ -115,7 +126,7 @@ def format_rows_as_text(self, rows):
 
         return "\n".join(text_parts)
 
-    def remove_extra_pages(self, page_limit=None):
+    def remove_extra_pages(self, page_limit: int | None = None):
         if not page_limit:
             return
 
@@ -158,7 +169,7 @@ def apply_ocr(self):
         self.file = temp_file
         self.file.seek(0)
 
-    def get_text(self):
+    def get_text(self) -> str:
         text = ""
         doc = pymupdf.open(stream=self.file, filetype="pdf")
         for page in doc:

From fa12e4d3f82731a7c0879c500df6c6673f568d72 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Fri, 13 Mar 2026 10:59:38 +0530
Subject: [PATCH 05/26] refactor: streamline PDF processing methods and enhance
 file handling

---
 .../utils/file_processor.py                   | 39 ++++++++++---------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py
index b2a80e8..a859c1f 100644
--- a/transaction_parser/transaction_parser/utils/file_processor.py
+++ b/transaction_parser/transaction_parser/utils/file_processor.py
@@ -37,10 +37,11 @@ def process_pdf(self, doc: File, page_limit: int | None = None) -> str:
         """
         Process PDF files with OCR and page limiting.
         """
-        self.file = io.BytesIO(doc.get_content())
-        self.remove_extra_pages(page_limit)
-        self.apply_ocr()
-        return self.get_text()
+        file = io.BytesIO(doc.get_content())
+        file = self.trim_pages(file, page_limit)
+        file = self.apply_ocr(file)
+
+        return self.get_text(file)
 
     def process_spreadsheet(self, doc: File) -> str:
         """
@@ -126,11 +127,11 @@ def format_rows_as_text(self, rows: list) -> str:
 
         return "\n".join(text_parts)
 
-    def remove_extra_pages(self, page_limit: int | None = None):
-        if not page_limit:
-            return
+    def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.BytesIO:
+        if not page_limit or page_limit <= 0:
+            return file
 
-        input_pdf = pymupdf.open(stream=self.file, filetype="pdf")
+        input_pdf = pymupdf.open(stream=file, filetype="pdf")
         output_pdf = pymupdf.open()
         output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1)
 
@@ -140,25 +141,25 @@ def remove_extra_pages(self, page_limit: int | None = None):
         output_pdf.close()
         input_pdf.close()
 
-        self.file = temp_file
-        self.file.seek(0)
+        temp_file.seek(0)
+        return temp_file
 
-    def apply_ocr(self):
-        doc = pymupdf.open(stream=self.file, filetype="pdf")
+    def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
+        doc = pymupdf.open(stream=file, filetype="pdf")
         pages_to_ocr = [
             str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip()
         ]
 
         if not pages_to_ocr:
-            return
+            return file
 
         pages = ",".join(pages_to_ocr)
 
         temp_file = io.BytesIO()
-        self.file.seek(0)
+        file.seek(0)
 
         ocrmypdf.ocr(
-            input_file=self.file,
+            input_file=file,
             output_file=temp_file,
             pages=pages,
             progress_bar=False,
@@ -166,12 +167,12 @@ def apply_ocr(self):
             force_ocr=True,
         )
 
-        self.file = temp_file
-        self.file.seek(0)
+        temp_file.seek(0)
+        return temp_file
 
-    def get_text(self) -> str:
+    def get_text(self, file: io.BytesIO) -> str:
         text = ""
-        doc = pymupdf.open(stream=self.file, filetype="pdf")
+        doc = pymupdf.open(stream=file, filetype="pdf")
         for page in doc:
             text += page.get_text("text")
 

From 9628d83c6057a7fcdb9f89b69e2cfec264dca91b Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Fri, 13 Mar 2026 11:02:10 +0530
Subject: [PATCH 06/26] fix: skip page trimming when the PDF is already
 within the page limit

---
 .../transaction_parser/utils/file_processor.py               | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py
index a859c1f..b9c6f56 100644
--- a/transaction_parser/transaction_parser/utils/file_processor.py
+++ b/transaction_parser/transaction_parser/utils/file_processor.py
@@ -132,6 +132,11 @@ def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.Byte
             return file
 
         input_pdf = pymupdf.open(stream=file, filetype="pdf")
+
+        if input_pdf.page_count <= page_limit:
+            input_pdf.close()
+            return file
+
         output_pdf = pymupdf.open()
         output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1)
 

From ae1ea00a2b052865e21038da367e04539410f5d4 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Fri, 13 Mar 2026 12:34:15 +0530
Subject: [PATCH 07/26] feat: add PDF processor selection and integration for
 enhanced document handling

---
 .../transaction_parser_settings.json          |  12 +-
 .../utils/file_processor.py                   |  92 ++--------
 .../transaction_parser/utils/pdf_processor.py | 171 ++++++++++++++++++
 3 files changed, 202 insertions(+), 73 deletions(-)
 create mode 100644 transaction_parser/transaction_parser/utils/pdf_processor.py

diff --git a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
index 73a5a16..0071f7c 100644
--- a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
+++ b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
@@ -9,6 +9,7 @@
   "enabled",
   "ai_model_section",
   "default_ai_model",
+  "pdf_processor",
   "api_keys",
   "transaction_configurations_section",
   "invoice_lookback_count",
@@ -92,6 +93,15 @@
    "mandatory_depends_on": "eval: doc.enabled",
    "options": "DeepSeek Chat\nDeepSeek Reasoner\nOpenAI gpt-4o\nOpenAI gpt-4o-mini\nOpenAI gpt-5\nOpenAI gpt-5-mini\nGoogle Gemini Pro\nGoogle Gemini Flash"
   },
+  {
+   "default": "OCR",
+   "depends_on": "eval: doc.enabled",
+   "description": "Select the library to use for PDF text extraction. OCR uses PyMuPDF + OCRmyPDF. Docling provides advanced document understanding.",
+   "fieldname": "pdf_processor",
+   "fieldtype": "Select",
+   "label": "PDF Processor",
+   "options": "OCR\nDocling"
+  },
   {
    "depends_on": "eval: doc.enabled",
    "fieldname": "api_keys",
@@ -177,4 +187,4 @@
  "sort_field": "modified",
  "sort_order": "DESC",
  "states": []
-}
\ No newline at end of file
+}
diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py
index b9c6f56..6e2d098 100644
--- a/transaction_parser/transaction_parser/utils/file_processor.py
+++ b/transaction_parser/transaction_parser/utils/file_processor.py
@@ -1,8 +1,6 @@
 import io
 
 import frappe
-import ocrmypdf
-import pymupdf
 from frappe import _
 from frappe.core.doctype.file.file import File
 from frappe.utils.csvutils import read_csv_content
@@ -11,9 +9,10 @@
     read_xlsx_file_from_attached_file,
 )
 
-# TODO: Make some method static
-# TODO: Remove self.file logic
-# TODO: Add DI like can use OCR or Docling for PDF processing
+from transaction_parser.transaction_parser.utils.pdf_processor import (
+    BasePDFProcessor,
+    get_pdf_processor,
+)
 
 
 class FileProcessor:
@@ -21,9 +20,14 @@ class FileProcessor:
     Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content.
     """
 
-    def get_content(self, doc: File, page_limit: int | None = None) -> str | None:
+    def get_content(
+        self,
+        doc: File,
+        page_limit: int | None = None,
+        pdf_processor: BasePDFProcessor | None = None,
+    ) -> str | None:
         if doc.file_type == "PDF":
-            return self.process_pdf(doc, page_limit)
+            return self.process_pdf(doc, page_limit, pdf_processor)
 
         if doc.file_type in ("CSV", "XLSX", "XLS"):
             return self.process_spreadsheet(doc)
@@ -33,15 +37,17 @@ def get_content(self, doc: File, page_limit: int | None = None) -> str | None:
             msg=_("Only PDF, CSV, and Excel files are supported"),
         )
 
-    def process_pdf(self, doc: File, page_limit: int | None = None) -> str:
+    def process_pdf(
+        self,
+        doc: File,
+        page_limit: int | None = None,
+        pdf_processor: BasePDFProcessor | None = None,
+    ) -> str:
         """
-        Process PDF files with OCR and page limiting.
+        Process PDF files using the configured PDF processor strategy.
         """
-        file = io.BytesIO(doc.get_content())
-        file = self.trim_pages(file, page_limit)
-        file = self.apply_ocr(file)
-
-        return self.get_text(file)
+        pdf_processor = pdf_processor or get_pdf_processor()
+        return pdf_processor.process(doc, page_limit)
 
     def process_spreadsheet(self, doc: File) -> str:
         """
@@ -126,61 +132,3 @@ def format_rows_as_text(self, rows: list) -> str:
         text_parts.append(f"Total columns: {len(rows[0])}")
 
         return "\n".join(text_parts)
-
-    def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.BytesIO:
-        if not page_limit or page_limit <= 0:
-            return file
-
-        input_pdf = pymupdf.open(stream=file, filetype="pdf")
-
-        if input_pdf.page_count <= page_limit:
-            input_pdf.close()
-            return file
-
-        output_pdf = pymupdf.open()
-        output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1)
-
-        temp_file = io.BytesIO()
-        output_pdf.save(temp_file)
-
-        output_pdf.close()
-        input_pdf.close()
-
-        temp_file.seek(0)
-        return temp_file
-
-    def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
-        doc = pymupdf.open(stream=file, filetype="pdf")
-        pages_to_ocr = [
-            str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip()
-        ]
-
-        if not pages_to_ocr:
-            return file
-
-        pages = ",".join(pages_to_ocr)
-
-        temp_file = io.BytesIO()
-        file.seek(0)
-
-        ocrmypdf.ocr(
-            input_file=file,
-            output_file=temp_file,
-            pages=pages,
-            progress_bar=False,
-            rotate_pages=True,
-            force_ocr=True,
-        )
-
-        temp_file.seek(0)
-        return temp_file
-
-    def get_text(self, file: io.BytesIO) -> str:
-        text = ""
-        doc = pymupdf.open(stream=file, filetype="pdf")
-        for page in doc:
-            text += page.get_text("text")
-
-        doc.close()
-
-        return text
diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
new file mode 100644
index 0000000..426c3b6
--- /dev/null
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -0,0 +1,171 @@
+import io
+from abc import ABC, abstractmethod
+
+import frappe
+import ocrmypdf
+import pymupdf
+from docling.datamodel.base_models import DocumentStream
+from docling.document_converter import DocumentConverter
+from frappe import _
+from frappe.core.doctype.file.file import File
+
+
+class BasePDFProcessor(ABC):
+    """
+    Abstract base class for PDF processors.
+
+    To add a new processor:
+    1. Create a new file in pdf_processors/
+    2. Subclass BasePDFProcessor
+    3. Implement the `process` method
+    4. Register it in pdf_processors/__init__.py PDF_PROCESSORS dict
+    """
+
+    @abstractmethod
+    def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
+        """
+        Process a PDF file and return extracted text.
+
+        Args:
+                file: PDF file as BytesIO stream or Frappe File document
+                page_limit: Maximum number of pages to process (None = all pages)
+
+        Returns:
+                Extracted text content from the PDF
+        """
+        ...
+
+    def get_sanitized_file(
+        self, file: io.BytesIO | File, page_limit: int | None = None
+    ) -> io.BytesIO:
+        """Get file as BytesIO stream and trim pages if needed."""
+        if isinstance(file, File):
+            file = io.BytesIO(file.get_content())
+
+        return self.trim_pages(file, page_limit)
+
+    def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.BytesIO:
+        if not page_limit or page_limit <= 0:
+            return file
+
+        input_pdf = pymupdf.open(stream=file, filetype="pdf")
+
+        if input_pdf.page_count <= page_limit:
+            input_pdf.close()
+            return file
+
+        output_pdf = pymupdf.open()
+        output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1)
+
+        temp_file = io.BytesIO()
+        output_pdf.save(temp_file)
+
+        output_pdf.close()
+        input_pdf.close()
+
+        temp_file.seek(0)
+        return temp_file
+
+
+class DoclingPDFProcessor(BasePDFProcessor):
+    """
+    PDF processor using Docling for document conversion and text extraction.
+
+    Docling provides advanced document understanding including table detection,
+    formula recognition, reading order detection, and OCR.
+    """
+
+    def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
+        file = self.get_sanitized_file(file, page_limit)
+
+        source = DocumentStream(name="document.pdf", stream=file)
+        converter = DocumentConverter()
+        result = converter.convert(source)
+
+        return result.document.export_to_markdown()
+
+
+class OCRPDFProcessor(BasePDFProcessor):
+    """
+    PDF processor using PyMuPDF for text extraction and OCRmyPDF for OCR.
+    """
+
+    def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
+        file = self.get_sanitized_file(file, page_limit)
+        file = self.apply_ocr(file)
+
+        return self.get_text(file)
+
+    def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
+        doc = pymupdf.open(stream=file, filetype="pdf")
+        pages_to_ocr = [
+            str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip()
+        ]
+
+        doc.close()
+
+        if not pages_to_ocr:
+            return file
+
+        pages = ",".join(pages_to_ocr)
+
+        temp_file = io.BytesIO()
+        file.seek(0)
+
+        ocrmypdf.ocr(
+            input_file=file,
+            output_file=temp_file,
+            pages=pages,
+            progress_bar=False,
+            rotate_pages=True,
+            force_ocr=True,
+        )
+
+        temp_file.seek(0)
+        return temp_file
+
+    def get_text(self, file: io.BytesIO) -> str:
+        text = ""
+        doc = pymupdf.open(stream=file, filetype="pdf")
+
+        for page in doc:
+            text += page.get_text("text")
+
+        doc.close()
+
+        return text
+
+
+# Registry: add new processors here
+PDF_PROCESSORS: dict[str, type[BasePDFProcessor]] = {
+    "OCR": OCRPDFProcessor,
+    "Docling": DoclingPDFProcessor,
+}
+
+DEFAULT_PDF_PROCESSOR = "OCR"
+
+
+def get_pdf_processor(name: str | None = None) -> BasePDFProcessor:
+    """
+    Factory function to get a PDF processor by name.
+
+    Usage:
+
+    ```
+    processor = get_pdf_processor("OCR")
+    text = processor.process(file, page_limit=5)
+    ```
+    """
+    name = name or DEFAULT_PDF_PROCESSOR
+
+    processor_class = PDF_PROCESSORS.get(name)
+    if not processor_class:
+        supported = ", ".join(PDF_PROCESSORS.keys())
+        frappe.throw(
+            title=_("Unsupported PDF Processor"),
+            msg=_("PDF Processor '{0}' is not supported. <br>Choose from: {1}").format(
+                name, supported
+            ),
+        )
+
+    return processor_class()

From 5910f00151ba06810a8b3e2319ee8bbb88686616 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Sat, 14 Mar 2026 18:06:02 +0530
Subject: [PATCH 08/26] fix: update default PDF processor to Docling and refine
 description

---
 .../transaction_parser_settings.json                   | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
index 0071f7c..09728ac 100644
--- a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
+++ b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
@@ -94,13 +94,13 @@
    "options": "DeepSeek Chat\nDeepSeek Reasoner\nOpenAI gpt-4o\nOpenAI gpt-4o-mini\nOpenAI gpt-5\nOpenAI gpt-5-mini\nGoogle Gemini Pro\nGoogle Gemini Flash"
   },
   {
-   "default": "OCR",
+   "default": "Docling",
    "depends_on": "eval: doc.enabled",
-   "description": "Select the library to use for PDF text extraction. OCR uses PyMuPDF + OCRmyPDF. Docling provides advanced document understanding.",
+   "description": "Select the library to use for PDF text extraction",
    "fieldname": "pdf_processor",
    "fieldtype": "Select",
    "label": "PDF Processor",
-   "options": "OCR\nDocling"
+   "options": "Docling\nOCRmyPDF"
   },
   {
    "depends_on": "eval: doc.enabled",
@@ -166,7 +166,7 @@
  "index_web_pages_for_search": 1,
  "issingle": 1,
  "links": [],
- "modified": "2025-09-08 08:48:58.870032",
+ "modified": "2026-03-14 13:35:17.150533",
  "modified_by": "Administrator",
  "module": "Transaction Parser",
  "name": "Transaction Parser Settings",
@@ -187,4 +187,4 @@
  "sort_field": "modified",
  "sort_order": "DESC",
  "states": []
-}
+}
\ No newline at end of file

From 002b433b9dcdc63ee7a056e70d30db6acd80129e Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Sun, 15 Mar 2026 18:16:45 +0530
Subject: [PATCH 09/26] fix: enhance DoclingPDFProcessor with converter setup
 and add PDF pipeline options

---
 .../transaction_parser/utils/pdf_processor.py   | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index 426c3b6..4e78c63 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -5,7 +5,8 @@
 import ocrmypdf
 import pymupdf
 from docling.datamodel.base_models import DocumentStream
-from docling.document_converter import DocumentConverter
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
 from frappe import _
 from frappe.core.doctype.file.file import File
 
@@ -33,7 +34,7 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
         Returns:
                 Extracted text content from the PDF
         """
-        ...
+        pass
 
     def get_sanitized_file(
         self, file: io.BytesIO | File, page_limit: int | None = None
@@ -79,11 +80,21 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
         file = self.get_sanitized_file(file, page_limit)
 
         source = DocumentStream(name="document.pdf", stream=file)
-        converter = DocumentConverter()
+        converter = self._get_converter()
         result = converter.convert(source)
 
         return result.document.export_to_markdown()
 
+    def _get_converter(self) -> DocumentConverter:
+        pipeline_options = PdfPipelineOptions()
+        pipeline_options.do_ocr = False  # TODO: OCR Setup
+
+        return DocumentConverter(
+            format_options={
+                "pdf": PdfFormatOption(pipeline_options=pipeline_options),
+            }
+        )
+
 
 class OCRPDFProcessor(BasePDFProcessor):
     """

From 3d43a256894bd4889a37441083ef8a44f42641dd Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Mon, 16 Mar 2026 10:44:22 +0530
Subject: [PATCH 10/26] feat: implement PDF processor selection and default
 setting for enhanced document handling

---
 transaction_parser/hooks.py                   |   5 +
 transaction_parser/patches.txt                |   3 +-
 .../patches/set_default_pdf_processor.py      |  11 ++
 .../transaction_parser_settings.json          |   6 +-
 .../utils/file_processor.py                   |   6 +-
 .../transaction_parser/utils/pdf_processor.py | 103 ++++++++++--------
 6 files changed, 83 insertions(+), 51 deletions(-)
 create mode 100644 transaction_parser/patches/set_default_pdf_processor.py

diff --git a/transaction_parser/hooks.py b/transaction_parser/hooks.py
index d389f4f..a944370 100644
--- a/transaction_parser/hooks.py
+++ b/transaction_parser/hooks.py
@@ -27,3 +27,8 @@
         "on_update": "transaction_parser.transaction_parser.overrides.communication.on_update",
     }
 }
+
+pdf_processors = {
+    "OCRMyPDF": "transaction_parser.transaction_parser.utils.pdf_processor.OCRMyPDFProcessor",
+    "Docling": "transaction_parser.transaction_parser.utils.pdf_processor.DoclingPDFProcessor",
+}
diff --git a/transaction_parser/patches.txt b/transaction_parser/patches.txt
index 8096de1..27002f4 100644
--- a/transaction_parser/patches.txt
+++ b/transaction_parser/patches.txt
@@ -4,4 +4,5 @@
 
 [post_model_sync]
 # Patches added in this section will be executed after doctypes are migrated
-execute:from transaction_parser.install import after_install; after_install() #2
\ No newline at end of file
+execute:from transaction_parser.install import after_install; after_install() #2
+transaction_parser.patches.set_default_pdf_processor #1
diff --git a/transaction_parser/patches/set_default_pdf_processor.py b/transaction_parser/patches/set_default_pdf_processor.py
new file mode 100644
index 0000000..c3ac3d4
--- /dev/null
+++ b/transaction_parser/patches/set_default_pdf_processor.py
@@ -0,0 +1,11 @@
+import frappe
+
+from transaction_parser.transaction_parser.utils.pdf_processor import (
+    DEFAULT_PDF_PROCESSOR,
+)
+
+
+def execute():
+    frappe.db.set_single_value(
+        "Transaction Parser Settings", "pdf_processor", DEFAULT_PDF_PROCESSOR
+    )
diff --git a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
index 09728ac..53dc83c 100644
--- a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
+++ b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
@@ -94,13 +94,13 @@
    "options": "DeepSeek Chat\nDeepSeek Reasoner\nOpenAI gpt-4o\nOpenAI gpt-4o-mini\nOpenAI gpt-5\nOpenAI gpt-5-mini\nGoogle Gemini Pro\nGoogle Gemini Flash"
   },
   {
-   "default": "Docling",
+   "default": "OCRMyPDF",
    "depends_on": "eval: doc.enabled",
    "description": "Select the library to use for PDF text extraction",
    "fieldname": "pdf_processor",
    "fieldtype": "Select",
    "label": "PDF Processor",
-   "options": "Docling\nOCRmyPDF"
+   "options": "OCRMyPDF\nDocling"
   },
   {
    "depends_on": "eval: doc.enabled",
@@ -187,4 +187,4 @@
  "sort_field": "modified",
  "sort_order": "DESC",
  "states": []
-}
\ No newline at end of file
+}
diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py
index 6e2d098..c73f58b 100644
--- a/transaction_parser/transaction_parser/utils/file_processor.py
+++ b/transaction_parser/transaction_parser/utils/file_processor.py
@@ -10,7 +10,7 @@
 )
 
 from transaction_parser.transaction_parser.utils.pdf_processor import (
-    BasePDFProcessor,
+    PDFProcessor,
     get_pdf_processor,
 )
 
@@ -24,7 +24,7 @@ def get_content(
         self,
         doc: File,
         page_limit: int | None = None,
-        pdf_processor: BasePDFProcessor | None = None,
+        pdf_processor: PDFProcessor | None = None,
     ) -> str | None:
         if doc.file_type == "PDF":
             return self.process_pdf(doc, page_limit, pdf_processor)
@@ -41,7 +41,7 @@ def process_pdf(
         self,
         doc: File,
         page_limit: int | None = None,
-        pdf_processor: BasePDFProcessor | None = None,
+        pdf_processor: PDFProcessor | None = None,
     ) -> str:
         """
         Process PDF files using the configured PDF processor strategy.
diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index 4e78c63..a87237b 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -10,16 +10,24 @@
 from frappe import _
 from frappe.core.doctype.file.file import File
 
+DEFAULT_PDF_PROCESSOR = "OCRMyPDF"
 
-class BasePDFProcessor(ABC):
+
+class PDFProcessor(ABC):
     """
     Abstract base class for PDF processors.
 
-    To add a new processor:
-    1. Create a new file in pdf_processors/
-    2. Subclass BasePDFProcessor
-    3. Implement the `process` method
-    4. Register it in pdf_processors/__init__.py PDF_PROCESSORS dict
+    To add a new processor from another app:
+
+        1. Subclass PDFProcessor
+        2. Implement the `process` method
+        3. Register it via the `pdf_processors` hook in your app's hooks.py:
+
+    ```
+    pdf_processors = {
+        "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
+    }
+    ```
     """
 
     @abstractmethod
@@ -28,18 +36,20 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
         Process a PDF file and return extracted text.
 
         Args:
-                file: PDF file as BytesIO stream or Frappe File document
-                page_limit: Maximum number of pages to process (None = all pages)
+                        file: PDF file as BytesIO stream or Frappe File document
+                        page_limit: Maximum number of pages to process (None = all pages)
 
         Returns:
-                Extracted text content from the PDF
+                        Extracted text content from the PDF
         """
         pass
 
     def get_sanitized_file(
         self, file: io.BytesIO | File, page_limit: int | None = None
     ) -> io.BytesIO:
-        """Get file as BytesIO stream and trim pages if needed."""
+        """
+        Get file as BytesIO stream and trim pages if needed.
+        """
         if isinstance(file, File):
             file = io.BytesIO(file.get_content())
 
@@ -67,8 +77,19 @@ def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.Byte
         temp_file.seek(0)
         return temp_file
 
+    def get_text(self, file: io.BytesIO) -> str:
+        text = ""
+        doc = pymupdf.open(stream=file, filetype="pdf")
 
-class DoclingPDFProcessor(BasePDFProcessor):
+        for page in doc:
+            text += page.get_text("text")
+
+        doc.close()
+
+        return text
+
+
+class DoclingPDFProcessor(PDFProcessor):
     """
     PDF processor using Docling for document conversion and text extraction.
 
@@ -79,7 +100,7 @@ class DoclingPDFProcessor(BasePDFProcessor):
     def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
         file = self.get_sanitized_file(file, page_limit)
 
-        source = DocumentStream(name="document.pdf", stream=file)
+        source = DocumentStream(name="document.pdf", stream=file)  # temporary name
         converter = self._get_converter()
         result = converter.convert(source)
 
@@ -96,7 +117,7 @@ def _get_converter(self) -> DocumentConverter:
         )
 
 
-class OCRPDFProcessor(BasePDFProcessor):
+class OCRMyPDFProcessor(PDFProcessor):
     """
     PDF processor using PyMuPDF for text extraction and OCRmyPDF for OCR.
     """
@@ -135,48 +156,42 @@ def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
         temp_file.seek(0)
         return temp_file
 
-    def get_text(self, file: io.BytesIO) -> str:
-        text = ""
-        doc = pymupdf.open(stream=file, filetype="pdf")
-
-        for page in doc:
-            text += page.get_text("text")
-
-        doc.close()
-
-        return text
-
 
-# Registry: add new processors here
-PDF_PROCESSORS: dict[str, type[BasePDFProcessor]] = {
-    "OCR": OCRPDFProcessor,
-    "Docling": DoclingPDFProcessor,
-}
-
-DEFAULT_PDF_PROCESSOR = "OCR"
-
-
-def get_pdf_processor(name: str | None = None) -> BasePDFProcessor:
+@frappe.request_cache
+def get_pdf_processor(name: str | None = None) -> PDFProcessor:
     """
     Factory function to get a PDF processor by name.
 
     Usage:
 
-    ```
-    processor = get_pdf_processor("OCR")
-    text = processor.process(file, page_limit=5)
-    ```
+        ```
+        processor = get_pdf_processor("OCR")
+        text = processor.process(file, page_limit=5)
+        ```
+
+    To register a custom processor from another app, add to its hooks.py:
+
+        ```
+        pdf_processors = {
+            "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
+        }
+        ```
     """
-    name = name or DEFAULT_PDF_PROCESSOR
+    if not name:
+        name = (
+            frappe.db.get_single_value("Transaction Parser Settings", "pdf_processor")
+            or DEFAULT_PDF_PROCESSOR
+        )
+
+    processors = frappe.get_hooks("pdf_processors") or {}
+    class_path = (processors.get(name) or [None])[-1]
 
-    processor_class = PDF_PROCESSORS.get(name)
-    if not processor_class:
-        supported = ", ".join(PDF_PROCESSORS.keys())
+    if not class_path:
         frappe.throw(
             title=_("Unsupported PDF Processor"),
             msg=_("PDF Processor '{0}' is not supported. <br>Choose from: {1}").format(
-                name, supported
+                name, ", ".join(processors.keys())
             ),
         )
 
-    return processor_class()
+    return frappe.get_attr(class_path)()

From 04fedc32c2aff780b72f5d92e995ec9531e71929 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 10:44:04 +0530
Subject: [PATCH 11/26] fix: refactor DoclingPDFProcessor to import necessary
 modules locally

---
 .../transaction_parser/utils/pdf_processor.py       | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index a87237b..f86824c 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -2,11 +2,7 @@
 from abc import ABC, abstractmethod
 
 import frappe
-import ocrmypdf
 import pymupdf
-from docling.datamodel.base_models import DocumentStream
-from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.document_converter import DocumentConverter, PdfFormatOption
 from frappe import _
 from frappe.core.doctype.file.file import File
 
@@ -98,6 +94,8 @@ class DoclingPDFProcessor(PDFProcessor):
     """
 
     def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
+        from docling.datamodel.base_models import DocumentStream
+
         file = self.get_sanitized_file(file, page_limit)
 
         source = DocumentStream(name="document.pdf", stream=file)  # temporary name
@@ -106,7 +104,10 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
 
         return result.document.export_to_markdown()
 
-    def _get_converter(self) -> DocumentConverter:
+    def _get_converter(self):
+        from docling.datamodel.pipeline_options import PdfPipelineOptions
+        from docling.document_converter import DocumentConverter, PdfFormatOption
+
         pipeline_options = PdfPipelineOptions()
         pipeline_options.do_ocr = False  # TODO: OCR Setup
 
@@ -129,6 +130,8 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
         return self.get_text(file)
 
     def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
+        import ocrmypdf
+
         doc = pymupdf.open(stream=file, filetype="pdf")
         pages_to_ocr = [
             str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip()

From c0ebb89ac7499cfc669e8a93188d677ee72319d6 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 10:49:20 +0530
Subject: [PATCH 12/26] revert: remove unnecessary request cache decorator from
 get_pdf_processor function

---
 transaction_parser/transaction_parser/utils/pdf_processor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index f86824c..2a9dbb2 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -160,7 +160,6 @@ def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
         return temp_file
 
 
-@frappe.request_cache
 def get_pdf_processor(name: str | None = None) -> PDFProcessor:
     """
     Factory function to get a PDF processor by name.

From 51b54c5263968b843da64a5f4587896db0082f29 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 10:56:21 +0530
Subject: [PATCH 13/26] chore: add comment to clarify processor resolution
 order

---
 transaction_parser/transaction_parser/utils/pdf_processor.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index 2a9dbb2..169c1af 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -186,6 +186,8 @@ def get_pdf_processor(name: str | None = None) -> PDFProcessor:
         )
 
     processors = frappe.get_hooks("pdf_processors") or {}
+
+    # [-1] → last in resolution order app's overrides will take precedence
     class_path = (processors.get(name) or [None])[-1]
 
     if not class_path:

From d15a87b3c3ce2349335d8b3bb7b6168480f905c7 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 11:00:34 +0530
Subject: [PATCH 14/26] fix: optimize DoclingPDFProcessor to use a singleton
 converter instance

---
 .../transaction_parser/utils/pdf_processor.py | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index 169c1af..7cc47ba 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -93,6 +93,8 @@ class DoclingPDFProcessor(PDFProcessor):
     formula recognition, reading order detection, and OCR.
     """
 
+    _converter = None
+
     def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
         from docling.datamodel.base_models import DocumentStream
 
@@ -105,17 +107,20 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
         return result.document.export_to_markdown()
 
     def _get_converter(self):
-        from docling.datamodel.pipeline_options import PdfPipelineOptions
-        from docling.document_converter import DocumentConverter, PdfFormatOption
+        if DoclingPDFProcessor._converter is None:
+            from docling.datamodel.pipeline_options import PdfPipelineOptions
+            from docling.document_converter import DocumentConverter, PdfFormatOption
 
-        pipeline_options = PdfPipelineOptions()
-        pipeline_options.do_ocr = False  # TODO: OCR Setup
+            pipeline_options = PdfPipelineOptions()
+            pipeline_options.do_ocr = False  # TODO: OCR Setup
 
-        return DocumentConverter(
-            format_options={
-                "pdf": PdfFormatOption(pipeline_options=pipeline_options),
-            }
-        )
+            DoclingPDFProcessor._converter = DocumentConverter(
+                format_options={
+                    "pdf": PdfFormatOption(pipeline_options=pipeline_options),
+                }
+            )
+
+        return DoclingPDFProcessor._converter
 
 
 class OCRMyPDFProcessor(PDFProcessor):

From 1e9825df9fcde0a938e9ebf6cd570f2e2a87afe9 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 11:22:46 +0530
Subject: [PATCH 15/26] fix: reset file pointer before returning in
 OCRMyPDFProcessor

---
 transaction_parser/transaction_parser/utils/pdf_processor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index 7cc47ba..2334d57 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -145,6 +145,7 @@ def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
         doc.close()
 
         if not pages_to_ocr:
+            file.seek(0)
             return file
 
         pages = ",".join(pages_to_ocr)

From 5b5d7f1bc4658c3bdd51f3f8682ffb348ce4f8f9 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 12:19:06 +0530
Subject: [PATCH 16/26] fix: enhance error handling for unsupported spreadsheet
 file types

---
 .../transaction_parser/utils/file_processor.py             | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py
index c73f58b..7423a74 100644
--- a/transaction_parser/transaction_parser/utils/file_processor.py
+++ b/transaction_parser/transaction_parser/utils/file_processor.py
@@ -62,6 +62,13 @@ def process_spreadsheet(self, doc: File) -> str:
             rows = read_xlsx_file_from_attached_file(fcontent=file_content)
         elif doc.file_type == "XLS":
             rows = read_xls_file_from_attached_file(file_content)
+        else:
+            frappe.throw(
+                title=_("Unsupported File Type"),
+                msg=_(
+                    "Cannot process spreadsheet with file type: {0}. <br> Supported types are CSV, XLSX, and XLS."
+                ).format(doc.file_type),
+            )
 
         # Convert rows to a formatted string representation
         return self.format_rows_as_text(rows)

From e9004cdec001ae60d976d0a3796fa043003f1b2f Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 12:34:49 +0530
Subject: [PATCH 17/26] chore: fix typo

---
 transaction_parser/transaction_parser/utils/pdf_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index 2334d57..203fd52 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -125,7 +125,7 @@ def _get_converter(self):
 
 class OCRMyPDFProcessor(PDFProcessor):
     """
-    PDF processor using PyMuPDF for text extraction and OCRmyPDF for OCR.
+    PDF processor using PyMuPDF for text extraction and OCRMyPDF for OCR.
     """
 
     def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:

From 505f7f129f4daa07c0c69f8b89fcf2dbbfbd1667 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 14:18:59 +0530
Subject: [PATCH 18/26] fix: improve formatting and clarity in PDFProcessor
 documentation

---
 .../transaction_parser/utils/pdf_processor.py | 39 ++++++++++---------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index 203fd52..268d0e8 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -15,14 +15,14 @@ class PDFProcessor(ABC):
 
     To add a new processor from another app:
 
-        1. Subclass PDFProcessor
-        2. Implement the `process` method
-        3. Register it via the `pdf_processors` hook in your app's hooks.py:
+            1. Subclass PDFProcessor
+            2. Implement the `process` method
+            3. Register it via the `pdf_processors` hook in your app's hooks.py:
 
     ```
-    pdf_processors = {
-        "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
-    }
+           pdf_processors = {
+               "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
+           }
     ```
     """
 
@@ -32,11 +32,11 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
         Process a PDF file and return extracted text.
 
         Args:
-                        file: PDF file as BytesIO stream or Frappe File document
-                        page_limit: Maximum number of pages to process (None = all pages)
+                                        file: PDF file as BytesIO stream or Frappe File document
+                                        page_limit: Maximum number of pages to process (None = all pages)
 
         Returns:
-                        Extracted text content from the PDF
+                                        Extracted text content from the PDF
         """
         pass
 
@@ -108,6 +108,7 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
 
     def _get_converter(self):
         if DoclingPDFProcessor._converter is None:
+            from docling.datamodel.base_models import InputFormat
             from docling.datamodel.pipeline_options import PdfPipelineOptions
             from docling.document_converter import DocumentConverter, PdfFormatOption
 
@@ -116,7 +117,7 @@ def _get_converter(self):
 
             DoclingPDFProcessor._converter = DocumentConverter(
                 format_options={
-                    "pdf": PdfFormatOption(pipeline_options=pipeline_options),
+                    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
                 }
             )
 
@@ -172,18 +173,18 @@ def get_pdf_processor(name: str | None = None) -> PDFProcessor:
 
     Usage:
 
-        ```
-        processor = get_pdf_processor("OCR")
-        text = processor.process(file, page_limit=5)
-        ```
+            ```
+                   processor = get_pdf_processor("OCR")
+                   text = processor.process(file, page_limit=5)
+            ```
 
     To register a custom processor from another app, add to its hooks.py:
 
-        ```
-        pdf_processors = {
-            "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
-        }
-        ```
+            ```
+                   pdf_processors = {
+                       "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
+                   }
+            ```
     """
     if not name:
         name = (

From fef2bb14a79f73e3e3f90b28547035b8647669d6 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 14:38:19 +0530
Subject: [PATCH 19/26] fix: add check for existing PDF processor setting
 before setting default

---
 transaction_parser/patches/set_default_pdf_processor.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/transaction_parser/patches/set_default_pdf_processor.py b/transaction_parser/patches/set_default_pdf_processor.py
index c3ac3d4..3d9fad3 100644
--- a/transaction_parser/patches/set_default_pdf_processor.py
+++ b/transaction_parser/patches/set_default_pdf_processor.py
@@ -6,6 +6,8 @@
 
 
 def execute():
-    frappe.db.set_single_value(
-        "Transaction Parser Settings", "pdf_processor", DEFAULT_PDF_PROCESSOR
-    )
+    DOCTYPE = "Transaction Parser Settings"
+    FIELD = "pdf_processor"
+
+    if not frappe.db.get_single_value(DOCTYPE, FIELD):
+        frappe.db.set_single_value(DOCTYPE, FIELD, DEFAULT_PDF_PROCESSOR)

From fd8d700cf7e48d6f3c682da7eace28b75873f66b Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 14:41:23 +0530
Subject: [PATCH 20/26] fix: enhance error handling in DoclingPDFProcessor for
 conversion status

---
 .../transaction_parser/utils/pdf_processor.py       | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index 268d0e8..621ca99 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -96,7 +96,7 @@ class DoclingPDFProcessor(PDFProcessor):
     _converter = None
 
     def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
-        from docling.datamodel.base_models import DocumentStream
+        from docling.datamodel.base_models import ConversionStatus, DocumentStream
 
         file = self.get_sanitized_file(file, page_limit)
 
@@ -104,6 +104,17 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
         converter = self._get_converter()
         result = converter.convert(source)
 
+        if result.status not in (
+            ConversionStatus.SUCCESS,
+            ConversionStatus.PARTIAL_SUCCESS,
+        ):
+            frappe.throw(
+                title=_("PDF Reading Failed"),
+                msg=_("Docling failed to read the document. Status: {0}").format(
+                    result.status
+                ),
+            )
+
         return result.document.export_to_markdown()
 
     def _get_converter(self):

From 426332ec4dd48979d8385d7ea1af725a9bac2546 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 15:13:48 +0530
Subject: [PATCH 21/26] fix: improve CSV content decoding by refining fallback
 encodings

---
 .../transaction_parser/utils/file_processor.py            | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py
index 7423a74..be1667a 100644
--- a/transaction_parser/transaction_parser/utils/file_processor.py
+++ b/transaction_parser/transaction_parser/utils/file_processor.py
@@ -82,17 +82,15 @@ def decode_csv_content(self, content: str | bytes) -> str:
             return content
 
         # If content is bytes, decode it
-        encodings = ["utf-8", "utf-8-sig", "latin1", "cp1252"]
-
-        for encoding in encodings:
+        for encoding in ("utf-8", "utf-8-sig", "cp1252"):
             try:
                 return content.decode(encoding)
             except UnicodeDecodeError:
                 continue
 
-        # If all encodings fail, try with error handling
+        # Latin-1 never raises; use as final fallback before giving up
         try:
-            return content.decode("utf-8", errors="replace")
+            return content.decode("latin1")
         except Exception:
             frappe.throw(
                 _(

From a47f59a605bc2484455a2058fd76c43fbc37fd9b Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 15:27:48 +0530
Subject: [PATCH 22/26] chore: add patches package and simplify CSV decode
 fallback

---
 transaction_parser/patches/__init__.py            |  0
 .../transaction_parser/utils/file_processor.py    | 15 ++++++---------
 2 files changed, 6 insertions(+), 9 deletions(-)
 create mode 100644 transaction_parser/patches/__init__.py

diff --git a/transaction_parser/patches/__init__.py b/transaction_parser/patches/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py
index be1667a..3cebb9b 100644
--- a/transaction_parser/transaction_parser/utils/file_processor.py
+++ b/transaction_parser/transaction_parser/utils/file_processor.py
@@ -82,21 +82,18 @@ def decode_csv_content(self, content: str | bytes) -> str:
             return content
 
         # If content is bytes, decode it
-        for encoding in ("utf-8", "utf-8-sig", "cp1252"):
+        # ! Note: Always keep `latin1` as the last fallback encoding, as it can decode any byte sequence without errors (Garbage)
+        for encoding in ("utf-8", "utf-8-sig", "cp1252", "latin1"):
             try:
                 return content.decode(encoding)
             except UnicodeDecodeError:
                 continue
 
-        # Latin-1 never raises; use as final fallback before giving up
-        try:
-            return content.decode("latin1")
-        except Exception:
-            frappe.throw(
-                _(
-                    "Unable to decode CSV file. Please ensure the file is saved with a supported encoding."
-                )
+        frappe.throw(
+            _(
+                "Unable to decode CSV file. Please ensure the file is saved with a supported encoding."
             )
+        )
 
     def format_rows_as_text(self, rows: list) -> str:
         """

From b5690d168faafd3134ca37ed22ddb14df4e9460c Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 15:45:58 +0530
Subject: [PATCH 23/26] fix: reset file pointer in trim_pages method for proper
 PDF processing

---
 transaction_parser/transaction_parser/utils/pdf_processor.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index 621ca99..766c806 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -53,12 +53,14 @@ def get_sanitized_file(
 
     def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.BytesIO:
         if not page_limit or page_limit <= 0:
+            file.seek(0)
             return file
 
         input_pdf = pymupdf.open(stream=file, filetype="pdf")
 
         if input_pdf.page_count <= page_limit:
             input_pdf.close()
+            file.seek(0)
             return file
 
         output_pdf = pymupdf.open()

From 984bf6f1a986f971e7de5a4f909c10ee030286f3 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 16:13:32 +0530
Subject: [PATCH 24/26] chore: reset file pointer once in apply_ocr

---
 transaction_parser/transaction_parser/utils/pdf_processor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index 766c806..be17266 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -157,15 +157,14 @@ def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
         ]
 
         doc.close()
+        file.seek(0)
 
         if not pages_to_ocr:
-            file.seek(0)
             return file
 
         pages = ",".join(pages_to_ocr)
 
         temp_file = io.BytesIO()
-        file.seek(0)
 
         ocrmypdf.ocr(
             input_file=file,

From 22b35bc3fd3756eba99bbd85d0f7b538ba0cb8a3 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 16:54:43 +0530
Subject: [PATCH 25/26] fix: improve formatting and readability in PDFProcessor
 documentation

---
 .../transaction_parser/utils/pdf_processor.py | 51 ++++++++++---------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index be17266..5cfd7d2 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -15,14 +15,14 @@ class PDFProcessor(ABC):
 
     To add a new processor from another app:
 
-            1. Subclass PDFProcessor
-            2. Implement the `process` method
-            3. Register it via the `pdf_processors` hook in your app's hooks.py:
+    1. Subclass PDFProcessor
+    2. Implement the `process` method
+    3. Register it via the `pdf_processors` hook in your app's hooks.py:
 
     ```
-           pdf_processors = {
-               "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
-           }
+    pdf_processors = {
+        "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
+    }
     ```
     """
 
@@ -32,11 +32,11 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
         Process a PDF file and return extracted text.
 
         Args:
-                                        file: PDF file as BytesIO stream or Frappe File document
-                                        page_limit: Maximum number of pages to process (None = all pages)
+            file: PDF file as BytesIO stream or Frappe File document
+            page_limit: Maximum number of pages to process (None = all pages)
 
         Returns:
-                                        Extracted text content from the PDF
+            Extracted text content from the PDF
         """
         pass
 
@@ -106,15 +106,18 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
         converter = self._get_converter()
         result = converter.convert(source)
 
-        if result.status not in (
-            ConversionStatus.SUCCESS,
-            ConversionStatus.PARTIAL_SUCCESS,
+        if (
+            not result
+            or not result.document
+            or result.status
+            not in (
+                ConversionStatus.SUCCESS,
+                ConversionStatus.PARTIAL_SUCCESS,
+            )
         ):
             frappe.throw(
                 title=_("PDF Reading Failed"),
-                msg=_("Docling failed to read the document. Status: {0}").format(
-                    result.status
-                ),
+                msg=_("Docling failed to read the document."),
             )
 
         return result.document.export_to_markdown()
@@ -185,18 +188,18 @@ def get_pdf_processor(name: str | None = None) -> PDFProcessor:
 
     Usage:
 
-            ```
-                   processor = get_pdf_processor("OCR")
-                   text = processor.process(file, page_limit=5)
-            ```
+    ```
+    processor = get_pdf_processor("OCR")
+    text = processor.process(file, page_limit=5)
+    ```
 
     To register a custom processor from another app, add to its hooks.py:
 
-            ```
-                   pdf_processors = {
-                       "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
-                   }
-            ```
+    ```
+    pdf_processors = {
+        "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
+    }
+    ```
     """
     if not name:
         name = (

From 9f02aafdc17e75b1f0d0909be48cc6f0514ac6d7 Mon Sep 17 00:00:00 2001
From: Abdeali Chharchhoda <abdealiking786@gmail.com>
Date: Tue, 17 Mar 2026 17:21:18 +0530
Subject: [PATCH 26/26] chore: narrow get_content return type, update usage example

---
 transaction_parser/transaction_parser/ai_integration/parser.py | 2 +-
 transaction_parser/transaction_parser/utils/pdf_processor.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/transaction_parser/transaction_parser/ai_integration/parser.py b/transaction_parser/transaction_parser/ai_integration/parser.py
index f74eec2..6db6d3f 100644
--- a/transaction_parser/transaction_parser/ai_integration/parser.py
+++ b/transaction_parser/transaction_parser/ai_integration/parser.py
@@ -132,7 +132,7 @@ def get_api_key(self) -> str:
             _("API Key not found for model {0}").format(self.model.service_provider)
         )
 
-    def get_content(self, response: dict) -> dict | str:
+    def get_content(self, response: dict) -> dict:
         """Extract content from API response."""
         content = response["choices"][0]["message"]["content"]
 
diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
index 5cfd7d2..8d0df88 100644
--- a/transaction_parser/transaction_parser/utils/pdf_processor.py
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -189,7 +189,7 @@ def get_pdf_processor(name: str | None = None) -> PDFProcessor:
     Usage:
 
     ```
-    processor = get_pdf_processor("OCR")
+    processor = get_pdf_processor("Docling")
     text = processor.process(file, page_limit=5)
     ```