diff --git a/pyproject.toml b/pyproject.toml
index 7d7d91b..a458259 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
"rapidfuzz~=3.12.2",
"pymupdf~=1.26.3",
"openai",
+ "docling>=2.75.0",
]
[build-system]
diff --git a/transaction_parser/hooks.py b/transaction_parser/hooks.py
index d389f4f..a944370 100644
--- a/transaction_parser/hooks.py
+++ b/transaction_parser/hooks.py
@@ -27,3 +27,8 @@
"on_update": "transaction_parser.transaction_parser.overrides.communication.on_update",
}
}
+
+# Registry of available PDF processors: processor name -> dotted path of the
+# processor class. `get_pdf_processor` reads it via `frappe.get_hooks("pdf_processors")`.
+pdf_processors = {
+ "OCRMyPDF": "transaction_parser.transaction_parser.utils.pdf_processor.OCRMyPDFProcessor",
+ "Docling": "transaction_parser.transaction_parser.utils.pdf_processor.DoclingPDFProcessor",
+}
diff --git a/transaction_parser/patches.txt b/transaction_parser/patches.txt
index 8096de1..27002f4 100644
--- a/transaction_parser/patches.txt
+++ b/transaction_parser/patches.txt
@@ -4,4 +4,5 @@
[post_model_sync]
# Patches added in this section will be executed after doctypes are migrated
-execute:from transaction_parser.install import after_install; after_install() #2
\ No newline at end of file
+execute:from transaction_parser.install import after_install; after_install() #2
+transaction_parser.patches.set_default_pdf_processor #1
diff --git a/transaction_parser/patches/__init__.py b/transaction_parser/patches/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/transaction_parser/patches/set_default_pdf_processor.py b/transaction_parser/patches/set_default_pdf_processor.py
new file mode 100644
index 0000000..3d9fad3
--- /dev/null
+++ b/transaction_parser/patches/set_default_pdf_processor.py
@@ -0,0 +1,13 @@
+import frappe
+
+from transaction_parser.transaction_parser.utils.pdf_processor import (
+ DEFAULT_PDF_PROCESSOR,
+)
+
+
+def execute():
+    """Fill in the PDF processor setting with the default when it is unset."""
+    settings_doctype = "Transaction Parser Settings"
+    fieldname = "pdf_processor"
+
+    if frappe.db.get_single_value(settings_doctype, fieldname):
+        # A processor is already configured; leave it untouched.
+        return
+
+    frappe.db.set_single_value(settings_doctype, fieldname, DEFAULT_PDF_PROCESSOR)
diff --git a/transaction_parser/transaction_parser/__init__.py b/transaction_parser/transaction_parser/__init__.py
index 83555ce..abd27da 100644
--- a/transaction_parser/transaction_parser/__init__.py
+++ b/transaction_parser/transaction_parser/__init__.py
@@ -28,6 +28,7 @@ def parse(transaction, country, file_url, ai_model=None, page_limit=None):
ai_model=cstr(ai_model),
page_limit=cint(page_limit),
queue="long",
+ now=frappe.conf.developer_mode,
)
diff --git a/transaction_parser/transaction_parser/ai_integration/parser.py b/transaction_parser/transaction_parser/ai_integration/parser.py
index f74eec2..6db6d3f 100644
--- a/transaction_parser/transaction_parser/ai_integration/parser.py
+++ b/transaction_parser/transaction_parser/ai_integration/parser.py
@@ -132,7 +132,7 @@ def get_api_key(self) -> str:
_("API Key not found for model {0}").format(self.model.service_provider)
)
- def get_content(self, response: dict) -> dict | str:
+ def get_content(self, response: dict) -> dict:
"""Extract content from API response."""
content = response["choices"][0]["message"]["content"]
diff --git a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
index 73a5a16..53dc83c 100644
--- a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
+++ b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json
@@ -9,6 +9,7 @@
"enabled",
"ai_model_section",
"default_ai_model",
+ "pdf_processor",
"api_keys",
"transaction_configurations_section",
"invoice_lookback_count",
@@ -92,6 +93,15 @@
"mandatory_depends_on": "eval: doc.enabled",
"options": "DeepSeek Chat\nDeepSeek Reasoner\nOpenAI gpt-4o\nOpenAI gpt-4o-mini\nOpenAI gpt-5\nOpenAI gpt-5-mini\nGoogle Gemini Pro\nGoogle Gemini Flash"
},
+ {
+ "default": "OCRMyPDF",
+ "depends_on": "eval: doc.enabled",
+ "description": "Select the library to use for PDF text extraction",
+ "fieldname": "pdf_processor",
+ "fieldtype": "Select",
+ "label": "PDF Processor",
+ "options": "OCRMyPDF\nDocling"
+ },
{
"depends_on": "eval: doc.enabled",
"fieldname": "api_keys",
@@ -156,7 +166,7 @@
"index_web_pages_for_search": 1,
"issingle": 1,
"links": [],
- "modified": "2025-09-08 08:48:58.870032",
+ "modified": "2026-03-14 13:35:17.150533",
"modified_by": "Administrator",
"module": "Transaction Parser",
"name": "Transaction Parser Settings",
@@ -177,4 +187,4 @@
"sort_field": "modified",
"sort_order": "DESC",
"states": []
-}
\ No newline at end of file
+}
diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py
index 8a3ccd9..3cebb9b 100644
--- a/transaction_parser/transaction_parser/utils/file_processor.py
+++ b/transaction_parser/transaction_parser/utils/file_processor.py
@@ -1,76 +1,104 @@
import io
import frappe
-import ocrmypdf
-import pymupdf
from frappe import _
+from frappe.core.doctype.file.file import File
from frappe.utils.csvutils import read_csv_content
from frappe.utils.xlsxutils import (
read_xls_file_from_attached_file,
read_xlsx_file_from_attached_file,
)
+from transaction_parser.transaction_parser.utils.pdf_processor import (
+ PDFProcessor,
+ get_pdf_processor,
+)
-class FileProcessor:
- """Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content."""
- def get_content(self, doc, page_limit=None):
+class FileProcessor:
+ """
+ Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content.
+ """
+
+ def get_content(
+ self,
+ doc: File,
+ page_limit: int | None = None,
+ pdf_processor: PDFProcessor | None = None,
+ ) -> str | None:
if doc.file_type == "PDF":
- return self._process_pdf(doc, page_limit)
- elif doc.file_type in ["CSV", "XLSX", "XLS"]:
- return self._process_spreadsheet(doc)
- else:
- frappe.throw(_("Only PDF, CSV, and Excel files are supported"))
+ return self.process_pdf(doc, page_limit, pdf_processor)
- def _process_pdf(self, doc, page_limit=None):
- """Process PDF files with OCR and page limiting."""
- self.file = io.BytesIO(doc.get_content())
- self._remove_extra_pages(page_limit)
- self._apply_ocr()
- return self._get_text()
+ if doc.file_type in ("CSV", "XLSX", "XLS"):
+ return self.process_spreadsheet(doc)
- def _process_spreadsheet(self, doc):
- """Process CSV and Excel files."""
+ frappe.throw(
+ title=_("Unsupported File Type"),
+ msg=_("Only PDF, CSV, and Excel files are supported"),
+ )
+
+ def process_pdf(
+ self,
+ doc: File,
+ page_limit: int | None = None,
+ pdf_processor: PDFProcessor | None = None,
+ ) -> str:
+ """
+ Extract text from a PDF by delegating to a PDF processor.
+
+ Args:
+ doc: File document holding the PDF.
+ page_limit: Page limit forwarded to the processor's `process` method.
+ pdf_processor: Processor to use; when omitted, `get_pdf_processor()`
+ is called without a name to pick one.
+
+ Returns:
+ Whatever the processor's `process` method returns for the file.
+ """
+ # Fall back to the factory only when the caller did not supply a processor.
+ pdf_processor = pdf_processor or get_pdf_processor()
+ return pdf_processor.process(doc, page_limit)
+
+ def process_spreadsheet(self, doc: File) -> str:
+ """
+ Read a CSV, XLSX, or XLS file and return its rows formatted as text.
+
+ CSV content is decoded with `decode_csv_content` before parsing; any other
+ file type is rejected through `frappe.throw`.
+ """
file_content = doc.get_content()
if doc.file_type == "CSV":
- file_content_str = self._decode_csv_content(file_content)
+ file_content_str = self.decode_csv_content(file_content)
rows = read_csv_content(file_content_str)
elif doc.file_type == "XLSX":
rows = read_xlsx_file_from_attached_file(fcontent=file_content)
elif doc.file_type == "XLS":
rows = read_xls_file_from_attached_file(file_content)
+ else:
+ frappe.throw(
+ title=_("Unsupported File Type"),
+ msg=_(
+ "Cannot process spreadsheet with file type: {0}.<br>Supported types are CSV, XLSX, and XLS."
+ ).format(doc.file_type),
+ )
# Convert rows to a formatted string representation
- return self._format_rows_as_text(rows)
+ return self.format_rows_as_text(rows)
- def _decode_csv_content(self, content):
- """Decode CSV file content with fallback encodings."""
+ def decode_csv_content(self, content: str | bytes) -> str:
+ """
+ Decode CSV file content with fallback encodings.
+
+ A `str` is returned unchanged; `bytes` are decoded with the first
+ encoding in the fallback list that does not raise `UnicodeDecodeError`.
+ """
# If content is already a string, return as-is
if isinstance(content, str):
return content
# If content is bytes, decode it
- encodings = ["utf-8", "utf-8-sig", "latin1", "cp1252"]
-
- for encoding in encodings:
+ # `utf-8-sig` goes first: it decodes plain UTF-8 as well and drops a leading BOM, whereas `utf-8` would keep the BOM in the text.
+ # ! Note: Always keep `latin1` as the last fallback encoding, as it can decode any byte sequence without errors (Garbage)
+ for encoding in ("utf-8-sig", "cp1252", "latin1"):
try:
return content.decode(encoding)
except UnicodeDecodeError:
continue
- # If all encodings fail, try with error handling
- try:
- return content.decode("utf-8", errors="replace")
- except Exception:
- frappe.throw(
- _(
- "Unable to decode CSV file. Please ensure the file is saved with a supported encoding."
- )
+ # Not reached while `latin1` is in the list above; kept as a safeguard in case the list changes.
+ frappe.throw(
+ _(
+ "Unable to decode CSV file. Please ensure the file is saved with a supported encoding."
+ )
+ )
- def _format_rows_as_text(self, rows):
- """Convert rows to a text format suitable for AI processing."""
+ def format_rows_as_text(self, rows: list) -> str:
+ """
+ Convert rows to a text format suitable for AI processing.
+ """
if not rows:
frappe.throw(_("No data found in the file."))
@@ -106,56 +134,3 @@ def _format_rows_as_text(self, rows):
text_parts.append(f"Total columns: {len(rows[0])}")
return "\n".join(text_parts)
-
- def _remove_extra_pages(self, page_limit=None):
- if not page_limit:
- return
-
- input_pdf = pymupdf.open(stream=self.file, filetype="pdf")
- output_pdf = pymupdf.open()
- output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1)
-
- temp_file = io.BytesIO()
- output_pdf.save(temp_file)
-
- output_pdf.close()
- input_pdf.close()
-
- self.file = temp_file
- self.file.seek(0)
-
- def _apply_ocr(self):
- doc = pymupdf.open(stream=self.file, filetype="pdf")
- pages_to_ocr = [
- str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip()
- ]
-
- if not pages_to_ocr:
- return
-
- pages = ",".join(pages_to_ocr)
-
- temp_file = io.BytesIO()
- self.file.seek(0)
-
- ocrmypdf.ocr(
- input_file=self.file,
- output_file=temp_file,
- pages=pages,
- progress_bar=False,
- rotate_pages=True,
- force_ocr=True,
- )
-
- self.file = temp_file
- self.file.seek(0)
-
- def _get_text(self):
- text = ""
- doc = pymupdf.open(stream=self.file, filetype="pdf")
- for page in doc:
- text += page.get_text("text")
-
- doc.close()
-
- return text
diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
new file mode 100644
index 0000000..8d0df88
--- /dev/null
+++ b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -0,0 +1,223 @@
+import io
+from abc import ABC, abstractmethod
+
+import frappe
+import pymupdf
+from frappe import _
+from frappe.core.doctype.file.file import File
+
+DEFAULT_PDF_PROCESSOR = "OCRMyPDF"
+
+
+class PDFProcessor(ABC):
+    """
+    Abstract base class for PDF processors.
+
+    To add a new processor from another app:
+
+    1. Subclass PDFProcessor
+    2. Implement the `process` method
+    3. Register it via the `pdf_processors` hook in your app's hooks.py:
+
+    ```
+    pdf_processors = {
+        "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
+    }
+    ```
+    """
+
+    @abstractmethod
+    def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
+        """
+        Process a PDF file and return extracted text.
+
+        Args:
+            file: PDF file as BytesIO stream or Frappe File document
+            page_limit: Maximum number of pages to process (None = all pages)
+
+        Returns:
+            Extracted text content from the PDF
+        """
+        pass
+
+    def get_sanitized_file(
+        self, file: io.BytesIO | File, page_limit: int | None = None
+    ) -> io.BytesIO:
+        """
+        Get file as BytesIO stream and trim pages if needed.
+
+        A Frappe File document is first loaded into memory through its
+        `get_content()`; the stream is then passed to `trim_pages`.
+        """
+        if isinstance(file, File):
+            file = io.BytesIO(file.get_content())
+
+        return self.trim_pages(file, page_limit)
+
+    def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.BytesIO:
+        """
+        Return a stream holding at most the first `page_limit` pages.
+
+        The given stream is rewound and returned as-is when no positive limit
+        is set or the PDF has no more pages than the limit; otherwise a new
+        stream containing only the leading pages is returned.
+        """
+        if not page_limit or page_limit <= 0:
+            file.seek(0)
+            return file
+
+        # Context managers close both documents even if copying or saving fails.
+        with pymupdf.open(stream=file, filetype="pdf") as input_pdf:
+            if input_pdf.page_count > page_limit:
+                with pymupdf.open() as output_pdf:
+                    # `to_page` is a zero-based, inclusive page index.
+                    output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1)
+
+                    temp_file = io.BytesIO()
+                    output_pdf.save(temp_file)
+
+                temp_file.seek(0)
+                return temp_file
+
+        file.seek(0)
+        return file
+
+    def get_text(self, file: io.BytesIO) -> str:
+        """
+        Return the plain text of every page, concatenated in page order.
+        """
+        with pymupdf.open(stream=file, filetype="pdf") as doc:
+            return "".join(page.get_text("text") for page in doc)
+
+
+class DoclingPDFProcessor(PDFProcessor):
+    """
+    PDF processor that converts the document with Docling and returns Markdown.
+
+    The Docling converter is created on first use and cached on the class so
+    that later calls reuse it. OCR is switched off in the pipeline options.
+    """
+
+    _converter = None
+
+    def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
+        """
+        Convert the (optionally page-limited) PDF and export it as Markdown.
+
+        Raises through `frappe.throw` when the conversion yields no document
+        or a status other than success / partial success.
+        """
+        from docling.datamodel.base_models import ConversionStatus, DocumentStream
+
+        stream = self.get_sanitized_file(file, page_limit)
+
+        # The stream has no file name of its own, so a placeholder is supplied.
+        source = DocumentStream(name="document.pdf", stream=stream)
+        result = self._get_converter().convert(source)
+
+        accepted = (ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS)
+
+        if not (result and result.document and result.status in accepted):
+            frappe.throw(
+                title=_("PDF Reading Failed"),
+                msg=_("Docling failed to read the document."),
+            )
+
+        return result.document.export_to_markdown()
+
+    def _get_converter(self):
+        """
+        Return the shared Docling converter, building it on the first call.
+        """
+        if DoclingPDFProcessor._converter is not None:
+            return DoclingPDFProcessor._converter
+
+        from docling.datamodel.base_models import InputFormat
+        from docling.datamodel.pipeline_options import PdfPipelineOptions
+        from docling.document_converter import DocumentConverter, PdfFormatOption
+
+        options = PdfPipelineOptions()
+        options.do_ocr = False  # TODO: OCR Setup
+
+        pdf_format = PdfFormatOption(pipeline_options=options)
+        DoclingPDFProcessor._converter = DocumentConverter(
+            format_options={InputFormat.PDF: pdf_format}
+        )
+
+        return DoclingPDFProcessor._converter
+
+
+class OCRMyPDFProcessor(PDFProcessor):
+    """
+    PDF processor using PyMuPDF for text extraction and OCRMyPDF for OCR.
+    """
+
+    def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
+        """
+        Trim the PDF to `page_limit`, OCR the pages without text, and return the text.
+        """
+        file = self.get_sanitized_file(file, page_limit)
+        file = self.apply_ocr(file)
+
+        return self.get_text(file)
+
+    def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
+        """
+        Run OCR on the pages that have no extractable text.
+
+        Returns the given stream (rewound) when every page already has text,
+        otherwise a new stream holding the OCRed PDF.
+        """
+        # The context manager closes the document even if text extraction fails.
+        with pymupdf.open(stream=file, filetype="pdf") as doc:
+            # Page numbers are collected 1-based, as strings, for the `pages` argument below.
+            pages_to_ocr = [
+                str(i)
+                for i, page in enumerate(doc, 1)
+                if not page.get_text("text").strip()
+            ]
+
+        file.seek(0)
+
+        if not pages_to_ocr:
+            return file
+
+        # Imported only once OCR is known to be needed.
+        import ocrmypdf
+
+        temp_file = io.BytesIO()
+
+        ocrmypdf.ocr(
+            input_file=file,
+            output_file=temp_file,
+            pages=",".join(pages_to_ocr),
+            progress_bar=False,
+            rotate_pages=True,
+            force_ocr=True,
+        )
+
+        temp_file.seek(0)
+        return temp_file
+
+
+def get_pdf_processor(name: str | None = None) -> PDFProcessor:
+    """
+    Factory function to get a PDF processor by name.
+
+    When `name` is empty, the `pdf_processor` value of Transaction Parser
+    Settings is used, falling back to `DEFAULT_PDF_PROCESSOR`. An unknown
+    name is rejected through `frappe.throw`.
+
+    Usage:
+
+    ```
+    processor = get_pdf_processor("Docling")
+    text = processor.process(file, page_limit=5)
+    ```
+
+    To register a custom processor from another app, add to its hooks.py:
+
+    ```
+    pdf_processors = {
+        "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
+    }
+    ```
+    """
+    if not name:
+        name = (
+            frappe.db.get_single_value("Transaction Parser Settings", "pdf_processor")
+            or DEFAULT_PDF_PROCESSOR
+        )
+
+    processors = frappe.get_hooks("pdf_processors") or {}
+
+    # The hook value for a name is treated as a list; the last entry is used so
+    # that an app later in the resolution order overrides earlier registrations.
+    class_path = (processors.get(name) or [None])[-1]
+
+    if not class_path:
+        frappe.throw(
+            title=_("Unsupported PDF Processor"),
+            msg=_("PDF Processor '{0}' is not supported.<br>Choose from: {1}").format(
+                name, ", ".join(processors)
+            ),
+        )
+
+    return frappe.get_attr(class_path)()