def execute():
    """
    Backfill the `pdf_processor` field of Transaction Parser Settings.

    The stored single-doctype value is read first; `DEFAULT_PDF_PROCESSOR`
    is written only when that value is falsy, so a value that is already
    set is left untouched.
    """
    settings_doctype = "Transaction Parser Settings"
    fieldname = "pdf_processor"

    current_value = frappe.db.get_single_value(settings_doctype, fieldname)
    if current_value:
        # Already configured - nothing to migrate.
        return

    frappe.db.set_single_value(settings_doctype, fieldname, DEFAULT_PDF_PROCESSOR)
"OCRMyPDF", + "depends_on": "eval: doc.enabled", + "description": "Select the library to use for PDF text extraction", + "fieldname": "pdf_processor", + "fieldtype": "Select", + "label": "PDF Processor", + "options": "OCRMyPDF\nDocling" + }, { "depends_on": "eval: doc.enabled", "fieldname": "api_keys", @@ -156,7 +166,7 @@ "index_web_pages_for_search": 1, "issingle": 1, "links": [], - "modified": "2025-09-08 08:48:58.870032", + "modified": "2026-03-14 13:35:17.150533", "modified_by": "Administrator", "module": "Transaction Parser", "name": "Transaction Parser Settings", @@ -177,4 +187,4 @@ "sort_field": "modified", "sort_order": "DESC", "states": [] -} \ No newline at end of file +} diff --git a/transaction_parser/transaction_parser/utils/file_processor.py b/transaction_parser/transaction_parser/utils/file_processor.py index 8a3ccd9..3cebb9b 100644 --- a/transaction_parser/transaction_parser/utils/file_processor.py +++ b/transaction_parser/transaction_parser/utils/file_processor.py @@ -1,76 +1,104 @@ import io import frappe -import ocrmypdf -import pymupdf from frappe import _ +from frappe.core.doctype.file.file import File from frappe.utils.csvutils import read_csv_content from frappe.utils.xlsxutils import ( read_xls_file_from_attached_file, read_xlsx_file_from_attached_file, ) +from transaction_parser.transaction_parser.utils.pdf_processor import ( + PDFProcessor, + get_pdf_processor, +) -class FileProcessor: - """Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content.""" - def get_content(self, doc, page_limit=None): +class FileProcessor: + """ + Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content. 
+ """ + + def get_content( + self, + doc: File, + page_limit: int | None = None, + pdf_processor: PDFProcessor | None = None, + ) -> str | None: if doc.file_type == "PDF": - return self._process_pdf(doc, page_limit) - elif doc.file_type in ["CSV", "XLSX", "XLS"]: - return self._process_spreadsheet(doc) - else: - frappe.throw(_("Only PDF, CSV, and Excel files are supported")) + return self.process_pdf(doc, page_limit, pdf_processor) - def _process_pdf(self, doc, page_limit=None): - """Process PDF files with OCR and page limiting.""" - self.file = io.BytesIO(doc.get_content()) - self._remove_extra_pages(page_limit) - self._apply_ocr() - return self._get_text() + if doc.file_type in ("CSV", "XLSX", "XLS"): + return self.process_spreadsheet(doc) - def _process_spreadsheet(self, doc): - """Process CSV and Excel files.""" + frappe.throw( + title=_("Unsupported File Type"), + msg=_("Only PDF, CSV, and Excel files are supported"), + ) + + def process_pdf( + self, + doc: File, + page_limit: int | None = None, + pdf_processor: PDFProcessor | None = None, + ) -> str: + """ + Process PDF files using the configured PDF processor strategy. + """ + pdf_processor = pdf_processor or get_pdf_processor() + return pdf_processor.process(doc, page_limit) + + def process_spreadsheet(self, doc: File) -> str: + """ + Process CSV and Excel files. + """ file_content = doc.get_content() if doc.file_type == "CSV": - file_content_str = self._decode_csv_content(file_content) + file_content_str = self.decode_csv_content(file_content) rows = read_csv_content(file_content_str) elif doc.file_type == "XLSX": rows = read_xlsx_file_from_attached_file(fcontent=file_content) elif doc.file_type == "XLS": rows = read_xls_file_from_attached_file(file_content) + else: + frappe.throw( + title=_("Unsupported File Type"), + msg=_( + "Cannot process spreadsheet with file type: {0}.
def decode_csv_content(self, content: str | bytes) -> str:
    """
    Decode CSV file content, trying progressively more permissive encodings.

    Args:
        content: Raw file content. A `str` is returned unchanged.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is stripped.
    """
    # If content is already a string, return as-is
    if isinstance(content, str):
        return content

    # `utf-8-sig` must come first: it decodes plain UTF-8 identically but
    # also drops a leading BOM. Plain `utf-8` would accept the BOM and leave
    # "\ufeff" at the start of the text (typically glued to the first header
    # cell of the CSV).
    for encoding in ("utf-8-sig", "cp1252"):
        try:
            return content.decode(encoding)
        except UnicodeDecodeError:
            continue

    # ! Note: `latin1` maps every byte to a code point, so it can never fail.
    # It is the last resort and may produce garbage for unknown encodings.
    return content.decode("latin1")
+ """ if not rows: frappe.throw(_("No data found in the file.")) @@ -106,56 +134,3 @@ def _format_rows_as_text(self, rows): text_parts.append(f"Total columns: {len(rows[0])}") return "\n".join(text_parts) - - def _remove_extra_pages(self, page_limit=None): - if not page_limit: - return - - input_pdf = pymupdf.open(stream=self.file, filetype="pdf") - output_pdf = pymupdf.open() - output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1) - - temp_file = io.BytesIO() - output_pdf.save(temp_file) - - output_pdf.close() - input_pdf.close() - - self.file = temp_file - self.file.seek(0) - - def _apply_ocr(self): - doc = pymupdf.open(stream=self.file, filetype="pdf") - pages_to_ocr = [ - str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip() - ] - - if not pages_to_ocr: - return - - pages = ",".join(pages_to_ocr) - - temp_file = io.BytesIO() - self.file.seek(0) - - ocrmypdf.ocr( - input_file=self.file, - output_file=temp_file, - pages=pages, - progress_bar=False, - rotate_pages=True, - force_ocr=True, - ) - - self.file = temp_file - self.file.seek(0) - - def _get_text(self): - text = "" - doc = pymupdf.open(stream=self.file, filetype="pdf") - for page in doc: - text += page.get_text("text") - - doc.close() - - return text diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py new file mode 100644 index 0000000..8d0df88 --- /dev/null +++ b/transaction_parser/transaction_parser/utils/pdf_processor.py @@ -0,0 +1,223 @@ +import io +from abc import ABC, abstractmethod + +import frappe +import pymupdf +from frappe import _ +from frappe.core.doctype.file.file import File + +DEFAULT_PDF_PROCESSOR = "OCRMyPDF" + + +class PDFProcessor(ABC): + """ + Abstract base class for PDF processors. + + To add a new processor from another app: + + 1. Subclass PDFProcessor + 2. Implement the `process` method + 3. 
class PDFProcessor(ABC):
    """
    Abstract base class for PDF processors.

    To add a new processor from another app:

    1. Subclass PDFProcessor
    2. Implement the `process` method
    3. Register it via the `pdf_processors` hook in your app's hooks.py:

    ```
    pdf_processors = {
        "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
    }
    ```
    """

    @abstractmethod
    def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
        """
        Process a PDF file and return extracted text.

        Args:
            file: PDF file as BytesIO stream or Frappe File document
            page_limit: Maximum number of pages to process (None = all pages)

        Returns:
            Extracted text content from the PDF
        """

    def get_sanitized_file(
        self, file: io.BytesIO | File, page_limit: int | None = None
    ) -> io.BytesIO:
        """
        Get file as BytesIO stream and trim pages if needed.

        A File document is first read into memory; a BytesIO stream is used
        as given. The stream is then passed through `trim_pages`.
        """
        if isinstance(file, File):
            file = io.BytesIO(file.get_content())

        return self.trim_pages(file, page_limit)

    def trim_pages(self, file: io.BytesIO, page_limit: int | None = None) -> io.BytesIO:
        """
        Return a stream holding at most the first `page_limit` pages.

        Args:
            file: PDF as a BytesIO stream.
            page_limit: Number of leading pages to keep. A falsy or
                non-positive value means "no limit".

        Returns:
            The original stream (rewound to position 0) when no trimming is
            needed, otherwise a new rewound BytesIO with the trimmed PDF.
        """
        if not page_limit or page_limit <= 0:
            file.seek(0)
            return file

        input_pdf = pymupdf.open(stream=file, filetype="pdf")
        try:
            if input_pdf.page_count <= page_limit:
                file.seek(0)
                return file

            output_pdf = pymupdf.open()
            try:
                # `to_page` is zero-based and inclusive.
                output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1)

                temp_file = io.BytesIO()
                output_pdf.save(temp_file)
            finally:
                # Close even if copying/saving fails.
                output_pdf.close()
        finally:
            input_pdf.close()

        temp_file.seek(0)
        return temp_file

    def get_text(self, file: io.BytesIO) -> str:
        """
        Return the plain text of every page in the PDF, concatenated in order.
        """
        doc = pymupdf.open(stream=file, filetype="pdf")
        try:
            return "".join(page.get_text("text") for page in doc)
        finally:
            # Close even if text extraction of a page fails.
            doc.close()
class DoclingPDFProcessor(PDFProcessor):
    """
    PDF processor that converts the document with Docling and returns Markdown.

    Docling provides advanced document understanding including table detection,
    formula recognition, reading order detection, and OCR. Note that the
    converter built here has OCR switched off (`do_ocr = False`).
    """

    # Converter instance stored on the class and reused by all instances.
    _converter = None

    def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
        """
        Convert the (optionally page-trimmed) PDF and export it as Markdown.

        Reports a failure through `frappe.throw` when Docling returns no
        result, no document, or a status other than success / partial success.
        """
        # Imported lazily so docling is only loaded when this processor runs.
        from docling.datamodel.base_models import ConversionStatus, DocumentStream

        stream = self.get_sanitized_file(file, page_limit)

        # The name is only a placeholder for the in-memory stream.
        source = DocumentStream(name="document.pdf", stream=stream)
        result = self._get_converter().convert(source)

        accepted = (ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS)
        if not (result and result.document and result.status in accepted):
            frappe.throw(
                title=_("PDF Reading Failed"),
                msg=_("Docling failed to read the document."),
            )

        return result.document.export_to_markdown()

    def _get_converter(self):
        """
        Return the class-level converter, building it on first use.
        """
        if DoclingPDFProcessor._converter is not None:
            return DoclingPDFProcessor._converter

        from docling.datamodel.base_models import InputFormat
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.document_converter import DocumentConverter, PdfFormatOption

        options = PdfPipelineOptions()
        options.do_ocr = False  # TODO: OCR Setup

        pdf_format = PdfFormatOption(pipeline_options=options)
        DoclingPDFProcessor._converter = DocumentConverter(
            format_options={InputFormat.PDF: pdf_format}
        )

        return DoclingPDFProcessor._converter
class OCRMyPDFProcessor(PDFProcessor):
    """
    PDF processor using PyMuPDF for text extraction and OCRMyPDF for OCR.
    """

    def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
        """
        Trim the PDF, OCR the pages that have no text, and return all text.
        """
        trimmed = self.get_sanitized_file(file, page_limit)
        searchable = self.apply_ocr(trimmed)

        return self.get_text(searchable)

    def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
        """
        Run OCR on pages whose extracted text is empty.

        Returns the input stream (rewound) when every page already has text,
        otherwise a new rewound stream containing the OCR output.
        """
        # Imported lazily so ocrmypdf is only loaded when this processor runs.
        import ocrmypdf

        doc = pymupdf.open(stream=file, filetype="pdf")

        # Page numbers are collected 1-based, as strings, for the `pages` argument.
        textless_pages = []
        for number, page in enumerate(doc, 1):
            if not page.get_text("text").strip():
                textless_pages.append(str(number))

        doc.close()
        file.seek(0)

        if not textless_pages:
            return file

        ocr_output = io.BytesIO()

        ocrmypdf.ocr(
            input_file=file,
            output_file=ocr_output,
            pages=",".join(textless_pages),
            progress_bar=False,
            rotate_pages=True,
            force_ocr=True,
        )

        ocr_output.seek(0)
        return ocr_output
def get_pdf_processor(name: str | None = None) -> PDFProcessor:
    """
    Factory function returning a PDF processor instance for `name`.

    When `name` is empty, the `pdf_processor` value of Transaction Parser
    Settings is used, falling back to `DEFAULT_PDF_PROCESSOR`.

    Usage:

    ```
    processor = get_pdf_processor("Docling")
    text = processor.process(file, page_limit=5)
    ```

    To register a custom processor from another app, add to its hooks.py:

    ```
    pdf_processors = {
        "MyProcessor": "my_app.utils.pdf_processor.MyPDFProcessor",
    }
    ```

    An unknown name is reported through `frappe.throw`, listing the
    registered processor names.
    """
    if not name:
        configured = frappe.db.get_single_value(
            "Transaction Parser Settings", "pdf_processor"
        )
        name = configured or DEFAULT_PDF_PROCESSOR

    processors = frappe.get_hooks("pdf_processors") or {}

    # Each hook key is treated as a list of dotted paths. The last entry is
    # used so that an app later in the resolution order overrides earlier ones.
    registered_paths = processors.get(name) or [None]
    class_path = registered_paths[-1]

    if not class_path:
        frappe.throw(
            title=_("Unsupported PDF Processor"),
            msg=_("PDF Processor '{0}' is not supported. Choose from: {1}").format(
                name, ", ".join(processors.keys())
            ),
        )

    processor_class = frappe.get_attr(class_path)
    return processor_class()