From ef240855c4badd26098ae4e4cd910e0287a39d37 Mon Sep 17 00:00:00 2001 From: Karm Soni Date: Mon, 2 Mar 2026 17:43:59 +0530 Subject: [PATCH 01/26] feat: integrate docling for document data extraction --- pyproject.toml | 1 + .../transaction_parser/utils/file_processor.py | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7d7d91b..a458259 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ "rapidfuzz~=3.12.2", "pymupdf~=1.26.3", "openai", + "docling>=2.75.0", ] [build-system] diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py index 8a3ccd9..e95b599 100644 --- a/transaction_parser/transaction_parser/utils/file_processor.py +++ b/transaction_parser/transaction_parser/utils/file_processor.py @@ -3,6 +3,8 @@ import frappe import ocrmypdf import pymupdf +from docling.datamodel.base_models import DocumentStream +from docling.document_converter import DocumentConverter from frappe import _ from frappe.utils.csvutils import read_csv_content from frappe.utils.xlsxutils import ( @@ -14,6 +16,14 @@ class FileProcessor: """Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content.""" + def get_doc_stream(self, doc): + """Get a DocumentStream for the given file doc.""" + content = self.get_content(doc) + return DocumentStream( + name=doc.file_name, + content=io.BytesIO(content), + ) + def get_content(self, doc, page_limit=None): if doc.file_type == "PDF": return self._process_pdf(doc, page_limit) @@ -24,10 +34,9 @@ def get_content(self, doc, page_limit=None): def _process_pdf(self, doc, page_limit=None): """Process PDF files with OCR and page limiting.""" - self.file = io.BytesIO(doc.get_content()) - self._remove_extra_pages(page_limit) - self._apply_ocr() - return self._get_text() + self.converter = DocumentConverter() + result = 
self.converter.convert(self.get_doc_stream(doc)) + return result.document.export_to_markdown() def _process_spreadsheet(self, doc): """Process CSV and Excel files.""" From e521e36bdce3e416de6783010d70df41c9c80b60 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Thu, 12 Mar 2026 14:54:59 +0530 Subject: [PATCH 02/26] chore: ensure developer mode is respected in transaction parsing queue --- transaction_parser/transaction_parser/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transaction_parser/transaction_parser/__init__.py b/transaction_parser/transaction_parser/__init__.py index 83555ce..abd27da 100644 --- a/transaction_parser/transaction_parser/__init__.py +++ b/transaction_parser/transaction_parser/__init__.py @@ -28,6 +28,7 @@ def parse(transaction, country, file_url, ai_model=None, page_limit=None): ai_model=cstr(ai_model), page_limit=cint(page_limit), queue="long", + now=frappe.conf.developer_mode, ) From 093955f8d34050e5351231349cd40ab493f4afae Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Thu, 12 Mar 2026 17:50:43 +0530 Subject: [PATCH 03/26] refactor: streamline file processing methods and enhance readability --- .../utils/file_processor.py | 57 +++++++++---------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py index e95b599..1ac90e9 100644 --- a/transaction_parser/transaction_parser/utils/file_processor.py +++ b/transaction_parser/transaction_parser/utils/file_processor.py @@ -3,8 +3,6 @@ import frappe import ocrmypdf import pymupdf -from docling.datamodel.base_models import DocumentStream -from docling.document_converter import DocumentConverter from frappe import _ from frappe.utils.csvutils import read_csv_content from frappe.utils.xlsxutils import ( @@ -14,36 +12,33 @@ class FileProcessor: - """Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content.""" 
- - def get_doc_stream(self, doc): - """Get a DocumentStream for the given file doc.""" - content = self.get_content(doc) - return DocumentStream( - name=doc.file_name, - content=io.BytesIO(content), - ) + """ + Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content. + """ def get_content(self, doc, page_limit=None): if doc.file_type == "PDF": - return self._process_pdf(doc, page_limit) + return self.process_pdf(doc, page_limit) elif doc.file_type in ["CSV", "XLSX", "XLS"]: - return self._process_spreadsheet(doc) + return self.process_spreadsheet(doc) else: frappe.throw(_("Only PDF, CSV, and Excel files are supported")) - def _process_pdf(self, doc, page_limit=None): + def process_pdf(self, doc, page_limit=None): """Process PDF files with OCR and page limiting.""" - self.converter = DocumentConverter() - result = self.converter.convert(self.get_doc_stream(doc)) - return result.document.export_to_markdown() - - def _process_spreadsheet(self, doc): - """Process CSV and Excel files.""" + self.file = io.BytesIO(doc.get_content()) + self.remove_extra_pages(page_limit) + self.apply_ocr() + return self.get_text() + + def process_spreadsheet(self, doc): + """ + Process CSV and Excel files. + """ file_content = doc.get_content() if doc.file_type == "CSV": - file_content_str = self._decode_csv_content(file_content) + file_content_str = self.decode_csv_content(file_content) rows = read_csv_content(file_content_str) elif doc.file_type == "XLSX": rows = read_xlsx_file_from_attached_file(fcontent=file_content) @@ -51,10 +46,12 @@ def _process_spreadsheet(self, doc): rows = read_xls_file_from_attached_file(file_content) # Convert rows to a formatted string representation - return self._format_rows_as_text(rows) + return self.format_rows_as_text(rows) - def _decode_csv_content(self, content): - """Decode CSV file content with fallback encodings.""" + def decode_csv_content(self, content): + """ + Decode CSV file content with fallback encodings. 
+ """ # If content is already a string, return as-is if isinstance(content, str): return content @@ -78,8 +75,10 @@ def _decode_csv_content(self, content): ) ) - def _format_rows_as_text(self, rows): - """Convert rows to a text format suitable for AI processing.""" + def format_rows_as_text(self, rows): + """ + Convert rows to a text format suitable for AI processing. + """ if not rows: frappe.throw(_("No data found in the file.")) @@ -116,7 +115,7 @@ def _format_rows_as_text(self, rows): return "\n".join(text_parts) - def _remove_extra_pages(self, page_limit=None): + def remove_extra_pages(self, page_limit=None): if not page_limit: return @@ -133,7 +132,7 @@ def _remove_extra_pages(self, page_limit=None): self.file = temp_file self.file.seek(0) - def _apply_ocr(self): + def apply_ocr(self): doc = pymupdf.open(stream=self.file, filetype="pdf") pages_to_ocr = [ str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip() @@ -159,7 +158,7 @@ def _apply_ocr(self): self.file = temp_file self.file.seek(0) - def _get_text(self): + def get_text(self): text = "" doc = pymupdf.open(stream=self.file, filetype="pdf") for page in doc: From cd16721bea292e6f18e11af5303e7b3c6875c546 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Fri, 13 Mar 2026 10:46:40 +0530 Subject: [PATCH 04/26] refactor: enhance type annotations and improve method signatures in FileProcessor --- .../utils/file_processor.py | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py index 1ac90e9..b2a80e8 100644 --- a/transaction_parser/transaction_parser/utils/file_processor.py +++ b/transaction_parser/transaction_parser/utils/file_processor.py @@ -4,34 +4,45 @@ import ocrmypdf import pymupdf from frappe import _ +from frappe.core.doctype.file.file import File from frappe.utils.csvutils import read_csv_content from 
frappe.utils.xlsxutils import ( read_xls_file_from_attached_file, read_xlsx_file_from_attached_file, ) +# TODO: Make some method static +# TODO: Remove self.file logic +# TODO: Add DI like can use OCR or Docling for PDF processing + class FileProcessor: """ Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content. """ - def get_content(self, doc, page_limit=None): + def get_content(self, doc: File, page_limit: int | None = None) -> str | None: if doc.file_type == "PDF": return self.process_pdf(doc, page_limit) - elif doc.file_type in ["CSV", "XLSX", "XLS"]: + + if doc.file_type in ("CSV", "XLSX", "XLS"): return self.process_spreadsheet(doc) - else: - frappe.throw(_("Only PDF, CSV, and Excel files are supported")) - def process_pdf(self, doc, page_limit=None): - """Process PDF files with OCR and page limiting.""" + frappe.throw( + title=_("Unsupported File Type"), + msg=_("Only PDF, CSV, and Excel files are supported"), + ) + + def process_pdf(self, doc: File, page_limit: int | None = None) -> str: + """ + Process PDF files with OCR and page limiting. + """ self.file = io.BytesIO(doc.get_content()) self.remove_extra_pages(page_limit) self.apply_ocr() return self.get_text() - def process_spreadsheet(self, doc): + def process_spreadsheet(self, doc: File) -> str: """ Process CSV and Excel files. """ @@ -48,7 +59,7 @@ def process_spreadsheet(self, doc): # Convert rows to a formatted string representation return self.format_rows_as_text(rows) - def decode_csv_content(self, content): + def decode_csv_content(self, content: str | bytes) -> str: """ Decode CSV file content with fallback encodings. """ @@ -75,7 +86,7 @@ def decode_csv_content(self, content): ) ) - def format_rows_as_text(self, rows): + def format_rows_as_text(self, rows: list) -> str: """ Convert rows to a text format suitable for AI processing. 
""" @@ -115,7 +126,7 @@ def format_rows_as_text(self, rows): return "\n".join(text_parts) - def remove_extra_pages(self, page_limit=None): + def remove_extra_pages(self, page_limit: int | None = None): if not page_limit: return @@ -158,7 +169,7 @@ def apply_ocr(self): self.file = temp_file self.file.seek(0) - def get_text(self): + def get_text(self) -> str: text = "" doc = pymupdf.open(stream=self.file, filetype="pdf") for page in doc: From fa12e4d3f82731a7c0879c500df6c6673f568d72 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Fri, 13 Mar 2026 10:59:38 +0530 Subject: [PATCH 05/26] refactor: streamline PDF processing methods and enhance file handling --- .../utils/file_processor.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py index b2a80e8..a859c1f 100644 --- a/transaction_parser/transaction_parser/utils/file_processor.py +++ b/transaction_parser/transaction_parser/utils/file_processor.py @@ -37,10 +37,11 @@ def process_pdf(self, doc: File, page_limit: int | None = None) -> str: """ Process PDF files with OCR and page limiting. 
""" - self.file = io.BytesIO(doc.get_content()) - self.remove_extra_pages(page_limit) - self.apply_ocr() - return self.get_text() + file = io.BytesIO(doc.get_content()) + file = self.trim_pages(file, page_limit) + file = self.apply_ocr(file) + + return self.get_text(file) def process_spreadsheet(self, doc: File) -> str: """ @@ -126,11 +127,11 @@ def format_rows_as_text(self, rows: list) -> str: return "\n".join(text_parts) - def remove_extra_pages(self, page_limit: int | None = None): - if not page_limit: - return + def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.BytesIO: + if not page_limit or page_limit <= 0: + return file - input_pdf = pymupdf.open(stream=self.file, filetype="pdf") + input_pdf = pymupdf.open(stream=file, filetype="pdf") output_pdf = pymupdf.open() output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1) @@ -140,25 +141,25 @@ def remove_extra_pages(self, page_limit: int | None = None): output_pdf.close() input_pdf.close() - self.file = temp_file - self.file.seek(0) + temp_file.seek(0) + return temp_file - def apply_ocr(self): - doc = pymupdf.open(stream=self.file, filetype="pdf") + def apply_ocr(self, file: io.BytesIO) -> io.BytesIO: + doc = pymupdf.open(stream=file, filetype="pdf") pages_to_ocr = [ str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip() ] if not pages_to_ocr: - return + return file pages = ",".join(pages_to_ocr) temp_file = io.BytesIO() - self.file.seek(0) + file.seek(0) ocrmypdf.ocr( - input_file=self.file, + input_file=file, output_file=temp_file, pages=pages, progress_bar=False, @@ -166,12 +167,12 @@ def apply_ocr(self): force_ocr=True, ) - self.file = temp_file - self.file.seek(0) + temp_file.seek(0) + return temp_file - def get_text(self) -> str: + def get_text(self, file: io.BytesIO) -> str: text = "" - doc = pymupdf.open(stream=self.file, filetype="pdf") + doc = pymupdf.open(stream=file, filetype="pdf") for page in doc: text += page.get_text("text") From 
9628d83c6057a7fcdb9f89b69e2cfec264dca91b Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Fri, 13 Mar 2026 11:02:10 +0530 Subject: [PATCH 06/26] fix: add page limit check for PDF processing in FileProcessor --- .../transaction_parser/utils/file_processor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py index a859c1f..b9c6f56 100644 --- a/transaction_parser/transaction_parser/utils/file_processor.py +++ b/transaction_parser/transaction_parser/utils/file_processor.py @@ -132,6 +132,11 @@ def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.Byte return file input_pdf = pymupdf.open(stream=file, filetype="pdf") + + if input_pdf.page_count <= page_limit: + input_pdf.close() + return file + output_pdf = pymupdf.open() output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1) From ae1ea00a2b052865e21038da367e04539410f5d4 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Fri, 13 Mar 2026 12:34:15 +0530 Subject: [PATCH 07/26] feat: add PDF processor selection and integration for enhanced document handling --- .../transaction_parser_settings.json | 12 +- .../utils/file_processor.py | 92 ++-------- .../transaction_parser/utils/pdf_processor.py | 171 ++++++++++++++++++ 3 files changed, 202 insertions(+), 73 deletions(-) create mode 100644 transaction_parser/transaction_parser/utils/pdf_processor.py diff --git a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json index 73a5a16..0071f7c 100644 --- a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json +++ b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json @@ -9,6 +9,7 @@ "enabled", 
"ai_model_section", "default_ai_model", + "pdf_processor", "api_keys", "transaction_configurations_section", "invoice_lookback_count", @@ -92,6 +93,15 @@ "mandatory_depends_on": "eval: doc.enabled", "options": "DeepSeek Chat\nDeepSeek Reasoner\nOpenAI gpt-4o\nOpenAI gpt-4o-mini\nOpenAI gpt-5\nOpenAI gpt-5-mini\nGoogle Gemini Pro\nGoogle Gemini Flash" }, + { + "default": "OCR", + "depends_on": "eval: doc.enabled", + "description": "Select the library to use for PDF text extraction. OCR uses PyMuPDF + OCRmyPDF. Docling provides advanced document understanding.", + "fieldname": "pdf_processor", + "fieldtype": "Select", + "label": "PDF Processor", + "options": "OCR\nDocling" + }, { "depends_on": "eval: doc.enabled", "fieldname": "api_keys", @@ -177,4 +187,4 @@ "sort_field": "modified", "sort_order": "DESC", "states": [] -} \ No newline at end of file +} diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py index b9c6f56..6e2d098 100644 --- a/transaction_parser/transaction_parser/utils/file_processor.py +++ b/transaction_parser/transaction_parser/utils/file_processor.py @@ -1,8 +1,6 @@ import io import frappe -import ocrmypdf -import pymupdf from frappe import _ from frappe.core.doctype.file.file import File from frappe.utils.csvutils import read_csv_content @@ -11,9 +9,10 @@ read_xlsx_file_from_attached_file, ) -# TODO: Make some method static -# TODO: Remove self.file logic -# TODO: Add DI like can use OCR or Docling for PDF processing +from transaction_parser.transaction_parser.utils.pdf_processor import ( + BasePDFProcessor, + get_pdf_processor, +) class FileProcessor: @@ -21,9 +20,14 @@ class FileProcessor: Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content. 
""" - def get_content(self, doc: File, page_limit: int | None = None) -> str | None: + def get_content( + self, + doc: File, + page_limit: int | None = None, + pdf_processor: BasePDFProcessor | None = None, + ) -> str | None: if doc.file_type == "PDF": - return self.process_pdf(doc, page_limit) + return self.process_pdf(doc, page_limit, pdf_processor) if doc.file_type in ("CSV", "XLSX", "XLS"): return self.process_spreadsheet(doc) @@ -33,15 +37,17 @@ def get_content(self, doc: File, page_limit: int | None = None) -> str | None: msg=_("Only PDF, CSV, and Excel files are supported"), ) - def process_pdf(self, doc: File, page_limit: int | None = None) -> str: + def process_pdf( + self, + doc: File, + page_limit: int | None = None, + pdf_processor: BasePDFProcessor | None = None, + ) -> str: """ - Process PDF files with OCR and page limiting. + Process PDF files using the configured PDF processor strategy. """ - file = io.BytesIO(doc.get_content()) - file = self.trim_pages(file, page_limit) - file = self.apply_ocr(file) - - return self.get_text(file) + pdf_processor = pdf_processor or get_pdf_processor() + return pdf_processor.process(doc, page_limit) def process_spreadsheet(self, doc: File) -> str: """ @@ -126,61 +132,3 @@ def format_rows_as_text(self, rows: list) -> str: text_parts.append(f"Total columns: {len(rows[0])}") return "\n".join(text_parts) - - def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.BytesIO: - if not page_limit or page_limit <= 0: - return file - - input_pdf = pymupdf.open(stream=file, filetype="pdf") - - if input_pdf.page_count <= page_limit: - input_pdf.close() - return file - - output_pdf = pymupdf.open() - output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1) - - temp_file = io.BytesIO() - output_pdf.save(temp_file) - - output_pdf.close() - input_pdf.close() - - temp_file.seek(0) - return temp_file - - def apply_ocr(self, file: io.BytesIO) -> io.BytesIO: - doc = pymupdf.open(stream=file, filetype="pdf") - 
pages_to_ocr = [ - str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip() - ] - - if not pages_to_ocr: - return file - - pages = ",".join(pages_to_ocr) - - temp_file = io.BytesIO() - file.seek(0) - - ocrmypdf.ocr( - input_file=file, - output_file=temp_file, - pages=pages, - progress_bar=False, - rotate_pages=True, - force_ocr=True, - ) - - temp_file.seek(0) - return temp_file - - def get_text(self, file: io.BytesIO) -> str: - text = "" - doc = pymupdf.open(stream=file, filetype="pdf") - for page in doc: - text += page.get_text("text") - - doc.close() - - return text diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py new file mode 100644 index 0000000..426c3b6 --- /dev/null +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -0,0 +1,171 @@ +import io +from abc import ABC, abstractmethod + +import frappe +import ocrmypdf +import pymupdf +from docling.datamodel.base_models import DocumentStream +from docling.document_converter import DocumentConverter +from frappe import _ +from frappe.core.doctype.file.file import File + + +class BasePDFProcessor(ABC): + """ + Abstract base class for PDF processors. + + To add a new processor: + 1. Create a new file in pdf_processors/ + 2. Subclass BasePDFProcessor + 3. Implement the `process` method + 4. Register it in pdf_processors/__init__.py PDF_PROCESSORS dict + """ + + @abstractmethod + def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str: + """ + Process a PDF file and return extracted text. + + Args: + file: PDF file as BytesIO stream or Frappe File document + page_limit: Maximum number of pages to process (None = all pages) + + Returns: + Extracted text content from the PDF + """ + ... 
+ + def get_sanitized_file( + self, file: io.BytesIO | File, page_limit: int | None = None + ) -> io.BytesIO: + """Get file as BytesIO stream and trim pages if needed.""" + if isinstance(file, File): + file = io.BytesIO(file.get_content()) + + return self.trim_pages(file, page_limit) + + def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.BytesIO: + if not page_limit or page_limit <= 0: + return file + + input_pdf = pymupdf.open(stream=file, filetype="pdf") + + if input_pdf.page_count <= page_limit: + input_pdf.close() + return file + + output_pdf = pymupdf.open() + output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1) + + temp_file = io.BytesIO() + output_pdf.save(temp_file) + + output_pdf.close() + input_pdf.close() + + temp_file.seek(0) + return temp_file + + +class DoclingPDFProcessor(BasePDFProcessor): + """ + PDF processor using Docling for document conversion and text extraction. + + Docling provides advanced document understanding including table detection, + formula recognition, reading order detection, and OCR. + """ + + def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str: + file = self.get_sanitized_file(file, page_limit) + + source = DocumentStream(name="document.pdf", stream=file) + converter = DocumentConverter() + result = converter.convert(source) + + return result.document.export_to_markdown() + + +class OCRPDFProcessor(BasePDFProcessor): + """ + PDF processor using PyMuPDF for text extraction and OCRmyPDF for OCR. 
+ """ + + def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str: + file = self.get_sanitized_file(file, page_limit) + file = self.apply_ocr(file) + + return self.get_text(file) + + def apply_ocr(self, file: io.BytesIO) -> io.BytesIO: + doc = pymupdf.open(stream=file, filetype="pdf") + pages_to_ocr = [ + str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip() + ] + + doc.close() + + if not pages_to_ocr: + return file + + pages = ",".join(pages_to_ocr) + + temp_file = io.BytesIO() + file.seek(0) + + ocrmypdf.ocr( + input_file=file, + output_file=temp_file, + pages=pages, + progress_bar=False, + rotate_pages=True, + force_ocr=True, + ) + + temp_file.seek(0) + return temp_file + + def get_text(self, file: io.BytesIO) -> str: + text = "" + doc = pymupdf.open(stream=file, filetype="pdf") + + for page in doc: + text += page.get_text("text") + + doc.close() + + return text + + +# Registry: add new processors here +PDF_PROCESSORS: dict[str, type[BasePDFProcessor]] = { + "OCR": OCRPDFProcessor, + "Docling": DoclingPDFProcessor, +} + +DEFAULT_PDF_PROCESSOR = "OCR" + + +def get_pdf_processor(name: str | None = None) -> BasePDFProcessor: + """ + Factory function to get a PDF processor by name. + + Usage: + + ``` + processor = get_pdf_processor("OCR") + text = processor.process(file, page_limit=5) + ``` + """ + name = name or DEFAULT_PDF_PROCESSOR + + processor_class = PDF_PROCESSORS.get(name) + if not processor_class: + supported = ", ".join(PDF_PROCESSORS.keys()) + frappe.throw( + title=_("Unsupported PDF Processor"), + msg=_("PDF Processor '{0}' is not supported.
Choose from: {1}").format( + name, supported + ), + ) + + return processor_class() From 5910f00151ba06810a8b3e2319ee8bbb88686616 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Sat, 14 Mar 2026 18:06:02 +0530 Subject: [PATCH 08/26] fix: update default PDF processor to Docling and refine description --- .../transaction_parser_settings.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json index 0071f7c..09728ac 100644 --- a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json +++ b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json @@ -94,13 +94,13 @@ "options": "DeepSeek Chat\nDeepSeek Reasoner\nOpenAI gpt-4o\nOpenAI gpt-4o-mini\nOpenAI gpt-5\nOpenAI gpt-5-mini\nGoogle Gemini Pro\nGoogle Gemini Flash" }, { - "default": "OCR", + "default": "Docling", "depends_on": "eval: doc.enabled", - "description": "Select the library to use for PDF text extraction. OCR uses PyMuPDF + OCRmyPDF. 
Docling provides advanced document understanding.", + "description": "Select the library to use for PDF text extraction", "fieldname": "pdf_processor", "fieldtype": "Select", "label": "PDF Processor", - "options": "OCR\nDocling" + "options": "Docling\nOCRmyPDF" }, { "depends_on": "eval: doc.enabled", @@ -166,7 +166,7 @@ "index_web_pages_for_search": 1, "issingle": 1, "links": [], - "modified": "2025-09-08 08:48:58.870032", + "modified": "2026-03-14 13:35:17.150533", "modified_by": "Administrator", "module": "Transaction Parser", "name": "Transaction Parser Settings", @@ -187,4 +187,4 @@ "sort_field": "modified", "sort_order": "DESC", "states": [] -} +} \ No newline at end of file From 002b433b9dcdc63ee7a056e70d30db6acd80129e Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Sun, 15 Mar 2026 18:16:45 +0530 Subject: [PATCH 09/26] fix: enhance DoclingPDFProcessor with converter setup and add PDF pipeline options --- .../transaction_parser/utils/pdf_processor.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index 426c3b6..4e78c63 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -5,7 +5,8 @@ import ocrmypdf import pymupdf from docling.datamodel.base_models import DocumentStream -from docling.document_converter import DocumentConverter +from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.document_converter import DocumentConverter, PdfFormatOption from frappe import _ from frappe.core.doctype.file.file import File @@ -33,7 +34,7 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str Returns: Extracted text content from the PDF """ - ... 
+ pass def get_sanitized_file( self, file: io.BytesIO | File, page_limit: int | None = None @@ -79,11 +80,21 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str file = self.get_sanitized_file(file, page_limit) source = DocumentStream(name="document.pdf", stream=file) - converter = DocumentConverter() + converter = self._get_converter() result = converter.convert(source) return result.document.export_to_markdown() + def _get_converter(self) -> DocumentConverter: + pipeline_options = PdfPipelineOptions() + pipeline_options.do_ocr = False # TODO: OCR Setup + + return DocumentConverter( + format_options={ + "pdf": PdfFormatOption(pipeline_options=pipeline_options), + } + ) + class OCRPDFProcessor(BasePDFProcessor): """ From 3d43a256894bd4889a37441083ef8a44f42641dd Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Mon, 16 Mar 2026 10:44:22 +0530 Subject: [PATCH 10/26] feat: implement PDF processor selection and default setting for enhanced document handling --- transaction_parser/hooks.py | 5 + transaction_parser/patches.txt | 3 +- .../patches/set_default_pdf_processor.py | 11 ++ .../transaction_parser_settings.json | 6 +- .../utils/file_processor.py | 6 +- .../transaction_parser/utils/pdf_processor.py | 103 ++++++++++-------- 6 files changed, 83 insertions(+), 51 deletions(-) create mode 100644 transaction_parser/patches/set_default_pdf_processor.py diff --git a/transaction_parser/hooks.py b/transaction_parser/hooks.py index d389f4f..a944370 100644 --- a/transaction_parser/hooks.py +++ b/transaction_parser/hooks.py @@ -27,3 +27,8 @@ "on_update": "transaction_parser.transaction_parser.overrides.communication.on_update", } } + +pdf_processors = { + "OCRMyPDF": "transaction_parser.transaction_parser.utils.pdf_processor.OCRMyPDFProcessor", + "Docling": "transaction_parser.transaction_parser.utils.pdf_processor.DoclingPDFProcessor", +} diff --git a/transaction_parser/patches.txt b/transaction_parser/patches.txt index 8096de1..27002f4 
100644 --- a/transaction_parser/patches.txt +++ b/transaction_parser/patches.txt @@ -4,4 +4,5 @@ [post_model_sync] # Patches added in this section will be executed after doctypes are migrated -execute:from transaction_parser.install import after_install; after_install() #2 \ No newline at end of file +execute:from transaction_parser.install import after_install; after_install() #2 +transaction_parser.patches.set_default_pdf_processor #1 diff --git a/transaction_parser/patches/set_default_pdf_processor.py b/transaction_parser/patches/set_default_pdf_processor.py new file mode 100644 index 0000000..c3ac3d4 --- /dev/null +++ b/transaction_parser/patches/set_default_pdf_processor.py @@ -0,0 +1,11 @@ +import frappe + +from transaction_parser.transaction_parser.utils.pdf_processor import ( + DEFAULT_PDF_PROCESSOR, +) + + +def execute(): + frappe.db.set_single_value( + "Transaction Parser Settings", "pdf_processor", DEFAULT_PDF_PROCESSOR + ) diff --git a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json index 09728ac..53dc83c 100644 --- a/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json +++ b/transaction_parser/transaction_parser/doctype/transaction_parser_settings/transaction_parser_settings.json @@ -94,13 +94,13 @@ "options": "DeepSeek Chat\nDeepSeek Reasoner\nOpenAI gpt-4o\nOpenAI gpt-4o-mini\nOpenAI gpt-5\nOpenAI gpt-5-mini\nGoogle Gemini Pro\nGoogle Gemini Flash" }, { - "default": "Docling", + "default": "OCRMyPDF", "depends_on": "eval: doc.enabled", "description": "Select the library to use for PDF text extraction", "fieldname": "pdf_processor", "fieldtype": "Select", "label": "PDF Processor", - "options": "Docling\nOCRmyPDF" + "options": "OCRMyPDF\nDocling" }, { "depends_on": "eval: doc.enabled", @@ -187,4 +187,4 @@ "sort_field": "modified", 
"sort_order": "DESC", "states": [] -} \ No newline at end of file +} diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py index 6e2d098..c73f58b 100644 --- a/transaction_parser/transaction_parser/utils/file_processor.py +++ b/transaction_parser/transaction_parser/utils/file_processor.py @@ -10,7 +10,7 @@ ) from transaction_parser.transaction_parser.utils.pdf_processor import ( - BasePDFProcessor, + PDFProcessor, get_pdf_processor, ) @@ -24,7 +24,7 @@ def get_content( self, doc: File, page_limit: int | None = None, - pdf_processor: BasePDFProcessor | None = None, + pdf_processor: PDFProcessor | None = None, ) -> str | None: if doc.file_type == "PDF": return self.process_pdf(doc, page_limit, pdf_processor) @@ -41,7 +41,7 @@ def process_pdf( self, doc: File, page_limit: int | None = None, - pdf_processor: BasePDFProcessor | None = None, + pdf_processor: PDFProcessor | None = None, ) -> str: """ Process PDF files using the configured PDF processor strategy. diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index 4e78c63..a87237b 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -10,16 +10,24 @@ from frappe import _ from frappe.core.doctype.file.file import File +DEFAULT_PDF_PROCESSOR = "OCRMyPDF" -class BasePDFProcessor(ABC): + +class PDFProcessor(ABC): """ Abstract base class for PDF processors. - To add a new processor: - 1. Create a new file in pdf_processors/ - 2. Subclass BasePDFProcessor - 3. Implement the `process` method - 4. Register it in pdf_processors/__init__.py PDF_PROCESSORS dict + To add a new processor from another app: + + 1. Subclass PDFProcessor + 2. Implement the `process` method + 3. 
Register it via the `pdf_processors` hook in your app's hooks.py: + + ``` + pdf_processors = { + "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor", + } + ``` """ @abstractmethod @@ -28,18 +36,20 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str Process a PDF file and return extracted text. Args: - file: PDF file as BytesIO stream or Frappe File document - page_limit: Maximum number of pages to process (None = all pages) + file: PDF file as BytesIO stream or Frappe File document + page_limit: Maximum number of pages to process (None = all pages) Returns: - Extracted text content from the PDF + Extracted text content from the PDF """ pass def get_sanitized_file( self, file: io.BytesIO | File, page_limit: int | None = None ) -> io.BytesIO: - """Get file as BytesIO stream and trim pages if needed.""" + """ + Get file as BytesIO stream and trim pages if needed. + """ if isinstance(file, File): file = io.BytesIO(file.get_content()) @@ -67,8 +77,19 @@ def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.Byte temp_file.seek(0) return temp_file + def get_text(self, file: io.BytesIO) -> str: + text = "" + doc = pymupdf.open(stream=file, filetype="pdf") -class DoclingPDFProcessor(BasePDFProcessor): + for page in doc: + text += page.get_text("text") + + doc.close() + + return text + + +class DoclingPDFProcessor(PDFProcessor): """ PDF processor using Docling for document conversion and text extraction. 
@@ -79,7 +100,7 @@ class DoclingPDFProcessor(BasePDFProcessor): def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str: file = self.get_sanitized_file(file, page_limit) - source = DocumentStream(name="document.pdf", stream=file) + source = DocumentStream(name="document.pdf", stream=file) # temporary name converter = self._get_converter() result = converter.convert(source) @@ -96,7 +117,7 @@ def _get_converter(self) -> DocumentConverter: ) -class OCRPDFProcessor(BasePDFProcessor): +class OCRMyPDFProcessor(PDFProcessor): """ PDF processor using PyMuPDF for text extraction and OCRmyPDF for OCR. """ @@ -135,48 +156,42 @@ def apply_ocr(self, file: io.BytesIO) -> io.BytesIO: temp_file.seek(0) return temp_file - def get_text(self, file: io.BytesIO) -> str: - text = "" - doc = pymupdf.open(stream=file, filetype="pdf") - - for page in doc: - text += page.get_text("text") - - doc.close() - - return text - -# Registry: add new processors here -PDF_PROCESSORS: dict[str, type[BasePDFProcessor]] = { - "OCR": OCRPDFProcessor, - "Docling": DoclingPDFProcessor, -} - -DEFAULT_PDF_PROCESSOR = "OCR" - - -def get_pdf_processor(name: str | None = None) -> BasePDFProcessor: +@frappe.request_cache +def get_pdf_processor(name: str | None = None) -> PDFProcessor: """ Factory function to get a PDF processor by name. 
Usage: - ``` - processor = get_pdf_processor("OCR") - text = processor.process(file, page_limit=5) - ``` + ``` + processor = get_pdf_processor("OCR") + text = processor.process(file, page_limit=5) + ``` + + To register a custom processor from another app, add to its hooks.py: + + ``` + pdf_processors = { + "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor", + } + ``` """ - name = name or DEFAULT_PDF_PROCESSOR + if not name: + name = ( + frappe.db.get_single_value("Transaction Parser Settings", "pdf_processor") + or DEFAULT_PDF_PROCESSOR + ) + + processors = frappe.get_hooks("pdf_processors") or {} + class_path = (processors.get(name) or [None])[-1] - processor_class = PDF_PROCESSORS.get(name) - if not processor_class: - supported = ", ".join(PDF_PROCESSORS.keys()) + if not class_path: frappe.throw( title=_("Unsupported PDF Processor"), msg=_("PDF Processor '{0}' is not supported.
Choose from: {1}").format( - name, supported + name, ", ".join(processors.keys()) ), ) - return processor_class() + return frappe.get_attr(class_path)() From 04fedc32c2aff780b72f5d92e995ec9531e71929 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 10:44:04 +0530 Subject: [PATCH 11/26] fix: refactor DoclingPDFProcessor to import necessary modules locally --- .../transaction_parser/utils/pdf_processor.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index a87237b..f86824c 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -2,11 +2,7 @@ from abc import ABC, abstractmethod import frappe -import ocrmypdf import pymupdf -from docling.datamodel.base_models import DocumentStream -from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.document_converter import DocumentConverter, PdfFormatOption from frappe import _ from frappe.core.doctype.file.file import File @@ -98,6 +94,8 @@ class DoclingPDFProcessor(PDFProcessor): """ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str: + from docling.datamodel.base_models import DocumentStream + file = self.get_sanitized_file(file, page_limit) source = DocumentStream(name="document.pdf", stream=file) # temporary name @@ -106,7 +104,10 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str return result.document.export_to_markdown() - def _get_converter(self) -> DocumentConverter: + def _get_converter(self): + from docling.datamodel.pipeline_options import PdfPipelineOptions + from docling.document_converter import DocumentConverter, PdfFormatOption + pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False # TODO: OCR Setup @@ -129,6 +130,8 @@ def process(self, file: 
io.BytesIO | File, page_limit: int | None = None) -> str return self.get_text(file) def apply_ocr(self, file: io.BytesIO) -> io.BytesIO: + import ocrmypdf + doc = pymupdf.open(stream=file, filetype="pdf") pages_to_ocr = [ str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip() From c0ebb89ac7499cfc669e8a93188d677ee72319d6 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 10:49:20 +0530 Subject: [PATCH 12/26] revert: remove unnecessary request cache decorator from get_pdf_processor function --- transaction_parser/transaction_parser/utils/pdf_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index f86824c..2a9dbb2 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -160,7 +160,6 @@ def apply_ocr(self, file: io.BytesIO) -> io.BytesIO: return temp_file -@frappe.request_cache def get_pdf_processor(name: str | None = None) -> PDFProcessor: """ Factory function to get a PDF processor by name. 
From 51b54c5263968b843da64a5f4587896db0082f29 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 10:56:21 +0530 Subject: [PATCH 13/26] chore: add comment to clarify processor resolution order --- transaction_parser/transaction_parser/utils/pdf_processor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index 2a9dbb2..169c1af 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -186,6 +186,8 @@ def get_pdf_processor(name: str | None = None) -> PDFProcessor: ) processors = frappe.get_hooks("pdf_processors") or {} + + # [-1] → the last app in hook resolution order wins, so its override takes precedence class_path = (processors.get(name) or [None])[-1] if not class_path: From d15a87b3c3ce2349335d8b3bb7b6168480f905c7 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 11:00:34 +0530 Subject: [PATCH 14/26] fix: optimize DoclingPDFProcessor to use a singleton converter instance --- .../transaction_parser/utils/pdf_processor.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index 169c1af..7cc47ba 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -93,6 +93,8 @@ class DoclingPDFProcessor(PDFProcessor): formula recognition, reading order detection, and OCR.
""" + _converter = None + def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str: from docling.datamodel.base_models import DocumentStream @@ -105,17 +107,20 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str return result.document.export_to_markdown() def _get_converter(self): - from docling.datamodel.pipeline_options import PdfPipelineOptions - from docling.document_converter import DocumentConverter, PdfFormatOption + if DoclingPDFProcessor._converter is None: + from docling.datamodel.pipeline_options import PdfPipelineOptions + from docling.document_converter import DocumentConverter, PdfFormatOption - pipeline_options = PdfPipelineOptions() - pipeline_options.do_ocr = False # TODO: OCR Setup + pipeline_options = PdfPipelineOptions() + pipeline_options.do_ocr = False # TODO: OCR Setup - return DocumentConverter( - format_options={ - "pdf": PdfFormatOption(pipeline_options=pipeline_options), - } - ) + DoclingPDFProcessor._converter = DocumentConverter( + format_options={ + "pdf": PdfFormatOption(pipeline_options=pipeline_options), + } + ) + + return DoclingPDFProcessor._converter class OCRMyPDFProcessor(PDFProcessor): From 1e9825df9fcde0a938e9ebf6cd570f2e2a87afe9 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 11:22:46 +0530 Subject: [PATCH 15/26] fix: reset file pointer before returning in OCRMyPDFProcessor --- transaction_parser/transaction_parser/utils/pdf_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index 7cc47ba..2334d57 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -145,6 +145,7 @@ def apply_ocr(self, file: io.BytesIO) -> io.BytesIO: doc.close() if not pages_to_ocr: + file.seek(0) return file pages = ",".join(pages_to_ocr) From 
5b5d7f1bc4658c3bdd51f3f8682ffb348ce4f8f9 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 12:19:06 +0530 Subject: [PATCH 16/26] fix: enhance error handling for unsupported spreadsheet file types --- .../transaction_parser/utils/file_processor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py index c73f58b..7423a74 100644 --- a/transaction_parser/transaction_parser/utils/file_processor.py +++ b/transaction_parser/transaction_parser/utils/file_processor.py @@ -62,6 +62,13 @@ def process_spreadsheet(self, doc: File) -> str: rows = read_xlsx_file_from_attached_file(fcontent=file_content) elif doc.file_type == "XLS": rows = read_xls_file_from_attached_file(file_content) + else: + frappe.throw( + title=_("Unsupported File Type"), + msg=_( + "Cannot process spreadsheet with file type: {0}.
Supported types are CSV, XLSX, and XLS." + ).format(doc.file_type), + ) # Convert rows to a formatted string representation return self.format_rows_as_text(rows) From e9004cdec001ae60d976d0a3796fa043003f1b2f Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 12:34:49 +0530 Subject: [PATCH 17/26] chore: fix typo --- transaction_parser/transaction_parser/utils/pdf_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index 2334d57..203fd52 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -125,7 +125,7 @@ def _get_converter(self): class OCRMyPDFProcessor(PDFProcessor): """ - PDF processor using PyMuPDF for text extraction and OCRmyPDF for OCR. + PDF processor using PyMuPDF for text extraction and OCRMyPDF for OCR. """ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str: From 505f7f129f4daa07c0c69f8b89fcf2dbbfbd1667 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 14:18:59 +0530 Subject: [PATCH 18/26] fix: improve formatting and clarity in PDFProcessor documentation --- .../transaction_parser/utils/pdf_processor.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index 203fd52..268d0e8 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -15,14 +15,14 @@ class PDFProcessor(ABC): To add a new processor from another app: - 1. Subclass PDFProcessor - 2. Implement the `process` method - 3. Register it via the `pdf_processors` hook in your app's hooks.py: + 1. Subclass PDFProcessor + 2. 
Implement the `process` method + 3. Register it via the `pdf_processors` hook in your app's hooks.py: ``` - pdf_processors = { - "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor", - } + pdf_processors = { + "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor", + } ``` """ @@ -32,11 +32,11 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str Process a PDF file and return extracted text. Args: - file: PDF file as BytesIO stream or Frappe File document - page_limit: Maximum number of pages to process (None = all pages) + file: PDF file as BytesIO stream or Frappe File document + page_limit: Maximum number of pages to process (None = all pages) Returns: - Extracted text content from the PDF + Extracted text content from the PDF """ pass @@ -108,6 +108,7 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str def _get_converter(self): if DoclingPDFProcessor._converter is None: + from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption @@ -116,7 +117,7 @@ def _get_converter(self): DoclingPDFProcessor._converter = DocumentConverter( format_options={ - "pdf": PdfFormatOption(pipeline_options=pipeline_options), + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options), } ) @@ -172,18 +173,18 @@ def get_pdf_processor(name: str | None = None) -> PDFProcessor: Usage: - ``` - processor = get_pdf_processor("OCR") - text = processor.process(file, page_limit=5) - ``` + ``` + processor = get_pdf_processor("OCR") + text = processor.process(file, page_limit=5) + ``` To register a custom processor from another app, add to its hooks.py: - ``` - pdf_processors = { - "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor", - } - ``` + ``` + pdf_processors = { + "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor", + } + ``` """ if not name: name = ( From 
fef2bb14a79f73e3e3f90b28547035b8647669d6 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 14:38:19 +0530 Subject: [PATCH 19/26] fix: add check for existing PDF processor setting before setting default --- transaction_parser/patches/set_default_pdf_processor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/transaction_parser/patches/set_default_pdf_processor.py b/transaction_parser/patches/set_default_pdf_processor.py index c3ac3d4..3d9fad3 100644 --- a/transaction_parser/patches/set_default_pdf_processor.py +++ b/transaction_parser/patches/set_default_pdf_processor.py @@ -6,6 +6,8 @@ def execute(): - frappe.db.set_single_value( - "Transaction Parser Settings", "pdf_processor", DEFAULT_PDF_PROCESSOR - ) + DOCTYPE = "Transaction Parser Settings" + FIELD = "pdf_processor" + + if not frappe.db.get_single_value(DOCTYPE, FIELD): + frappe.db.set_single_value(DOCTYPE, FIELD, DEFAULT_PDF_PROCESSOR) From fd8d700cf7e48d6f3c682da7eace28b75873f66b Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 14:41:23 +0530 Subject: [PATCH 20/26] fix: enhance error handling in DoclingPDFProcessor for conversion status --- .../transaction_parser/utils/pdf_processor.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index 268d0e8..621ca99 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -96,7 +96,7 @@ class DoclingPDFProcessor(PDFProcessor): _converter = None def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str: - from docling.datamodel.base_models import DocumentStream + from docling.datamodel.base_models import ConversionStatus, DocumentStream file = self.get_sanitized_file(file, page_limit) @@ -104,6 +104,17 @@ def process(self, file: 
io.BytesIO | File, page_limit: int | None = None) -> str converter = self._get_converter() result = converter.convert(source) + if result.status not in ( + ConversionStatus.SUCCESS, + ConversionStatus.PARTIAL_SUCCESS, + ): + frappe.throw( + title=_("PDF Reading Failed"), + msg=_("Docling failed to read the document. Status: {0}").format( + result.status + ), + ) + return result.document.export_to_markdown() def _get_converter(self): From 426332ec4dd48979d8385d7ea1af725a9bac2546 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 15:13:48 +0530 Subject: [PATCH 21/26] fix: improve CSV content decoding by refining fallback encodings --- .../transaction_parser/utils/file_processor.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py index 7423a74..be1667a 100644 --- a/transaction_parser/transaction_parser/utils/file_processor.py +++ b/transaction_parser/transaction_parser/utils/file_processor.py @@ -82,17 +82,15 @@ def decode_csv_content(self, content: str | bytes) -> str: return content # If content is bytes, decode it - encodings = ["utf-8", "utf-8-sig", "latin1", "cp1252"] - - for encoding in encodings: + for encoding in ("utf-8", "utf-8-sig", "cp1252"): try: return content.decode(encoding) except UnicodeDecodeError: continue - # If all encodings fail, try with error handling + # Latin-1 never raises; use as final fallback before giving up try: - return content.decode("utf-8", errors="replace") + return content.decode("latin1") except Exception: frappe.throw( _( From a47f59a605bc2484455a2058fd76c43fbc37fd9b Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 15:27:48 +0530 Subject: [PATCH 22/26] chore: minor fix --- transaction_parser/patches/__init__.py | 0 .../transaction_parser/utils/file_processor.py | 15 ++++++--------- 2 files changed, 6 insertions(+), 9 deletions(-) 
create mode 100644 transaction_parser/patches/__init__.py diff --git a/transaction_parser/patches/__init__.py b/transaction_parser/patches/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py index be1667a..3cebb9b 100644 --- a/transaction_parser/transaction_parser/utils/file_processor.py +++ b/transaction_parser/transaction_parser/utils/file_processor.py @@ -82,21 +82,18 @@ def decode_csv_content(self, content: str | bytes) -> str: return content # If content is bytes, decode it - for encoding in ("utf-8", "utf-8-sig", "cp1252"): + # ! Note: Always keep `latin1` as the last fallback encoding: it decodes any byte sequence without raising an error, so a wrong guess yields garbage text instead of a failure + for encoding in ("utf-8", "utf-8-sig", "cp1252", "latin1"): try: return content.decode(encoding) except UnicodeDecodeError: continue - # Latin-1 never raises; use as final fallback before giving up - try: - return content.decode("latin1") - except Exception: - frappe.throw( - _( - "Unable to decode CSV file. Please ensure the file is saved with a supported encoding." - ) + frappe.throw( + _( + "Unable to decode CSV file. Please ensure the file is saved with a supported encoding."
) + ) def format_rows_as_text(self, rows: list) -> str: """ From b5690d168faafd3134ca37ed22ddb14df4e9460c Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 15:45:58 +0530 Subject: [PATCH 23/26] fix: reset file pointer in trim_pages method for proper PDF processing --- transaction_parser/transaction_parser/utils/pdf_processor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index 621ca99..766c806 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -53,12 +53,14 @@ def get_sanitized_file( def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.BytesIO: if not page_limit or page_limit <= 0: + file.seek(0) return file input_pdf = pymupdf.open(stream=file, filetype="pdf") if input_pdf.page_count <= page_limit: input_pdf.close() + file.seek(0) return file output_pdf = pymupdf.open() From 984bf6f1a986f971e7de5a4f909c10ee030286f3 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 16:13:32 +0530 Subject: [PATCH 24/26] chore: minor change --- transaction_parser/transaction_parser/utils/pdf_processor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index 766c806..be17266 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -157,15 +157,14 @@ def apply_ocr(self, file: io.BytesIO) -> io.BytesIO: ] doc.close() + file.seek(0) if not pages_to_ocr: - file.seek(0) return file pages = ",".join(pages_to_ocr) temp_file = io.BytesIO() - file.seek(0) ocrmypdf.ocr( input_file=file, From 22b35bc3fd3756eba99bbd85d0f7b538ba0cb8a3 Mon Sep 17 00:00:00 2001 From: Abdeali 
Chharchhoda Date: Tue, 17 Mar 2026 16:54:43 +0530 Subject: [PATCH 25/26] fix: improve formatting and readability in PDFProcessor documentation --- .../transaction_parser/utils/pdf_processor.py | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index be17266..5cfd7d2 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -15,14 +15,14 @@ class PDFProcessor(ABC): To add a new processor from another app: - 1. Subclass PDFProcessor - 2. Implement the `process` method - 3. Register it via the `pdf_processors` hook in your app's hooks.py: + 1. Subclass PDFProcessor + 2. Implement the `process` method + 3. Register it via the `pdf_processors` hook in your app's hooks.py: ``` - pdf_processors = { - "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor", - } + pdf_processors = { + "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor", + } ``` """ @@ -32,11 +32,11 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str Process a PDF file and return extracted text. 
Args: - file: PDF file as BytesIO stream or Frappe File document - page_limit: Maximum number of pages to process (None = all pages) + file: PDF file as BytesIO stream or Frappe File document + page_limit: Maximum number of pages to process (None = all pages) Returns: - Extracted text content from the PDF + Extracted text content from the PDF """ pass @@ -106,15 +106,18 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str converter = self._get_converter() result = converter.convert(source) - if result.status not in ( - ConversionStatus.SUCCESS, - ConversionStatus.PARTIAL_SUCCESS, + if ( + not result + or not result.document + or result.status + not in ( + ConversionStatus.SUCCESS, + ConversionStatus.PARTIAL_SUCCESS, + ) ): frappe.throw( title=_("PDF Reading Failed"), - msg=_("Docling failed to read the document. Status: {0}").format( - result.status - ), + msg=_("Docling failed to read the document."), ) return result.document.export_to_markdown() @@ -185,18 +188,18 @@ def get_pdf_processor(name: str | None = None) -> PDFProcessor: Usage: - ``` - processor = get_pdf_processor("OCR") - text = processor.process(file, page_limit=5) - ``` + ``` + processor = get_pdf_processor("OCR") + text = processor.process(file, page_limit=5) + ``` To register a custom processor from another app, add to its hooks.py: - ``` - pdf_processors = { - "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor", - } - ``` + ``` + pdf_processors = { + "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor", + } + ``` """ if not name: name = ( From 9f02aafdc17e75b1f0d0909be48cc6f0514ac6d7 Mon Sep 17 00:00:00 2001 From: Abdeali Chharchhoda Date: Tue, 17 Mar 2026 17:21:18 +0530 Subject: [PATCH 26/26] chore: minor change --- transaction_parser/transaction_parser/ai_integration/parser.py | 2 +- transaction_parser/transaction_parser/utils/pdf_processor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/transaction_parser/transaction_parser/ai_integration/parser.py b/transaction_parser/transaction_parser/ai_integration/parser.py index f74eec2..6db6d3f 100644 --- a/transaction_parser/transaction_parser/ai_integration/parser.py +++ b/transaction_parser/transaction_parser/ai_integration/parser.py @@ -132,7 +132,7 @@ def get_api_key(self) -> str: _("API Key not found for model {0}").format(self.model.service_provider) ) - def get_content(self, response: dict) -> dict | str: + def get_content(self, response: dict) -> dict: """Extract content from API response.""" content = response["choices"][0]["message"]["content"] diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py index 5cfd7d2..8d0df88 100644 --- a/transaction_parser/transaction_parser/utils/pdf_processor.py +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -189,7 +189,7 @@ def get_pdf_processor(name: str | None = None) -> PDFProcessor: Usage: ``` - processor = get_pdf_processor("OCR") + processor = get_pdf_processor("Docling") text = processor.process(file, page_limit=5) ```