Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
ef24085
feat: integrate docling for document data extraction
karm1000 Mar 2, 2026
e521e36
chore: ensure developer mode is respected in transaction parsing queue
Abdeali099 Mar 12, 2026
093955f
refactor: streamline file processing methods and enhance readability
Abdeali099 Mar 12, 2026
cd16721
refactor: enhance type annotations and improve method signatures in F…
Abdeali099 Mar 13, 2026
fa12e4d
refactor: streamline PDF processing methods and enhance file handling
Abdeali099 Mar 13, 2026
9628d83
fix: add page limit check for PDF processing in FileProcessor
Abdeali099 Mar 13, 2026
ae1ea00
feat: add PDF processor selection and integration for enhanced docume…
Abdeali099 Mar 13, 2026
5910f00
fix: update default PDF processor to Docling and refine description
Abdeali099 Mar 14, 2026
002b433
fix: enhance DoclingPDFProcessor with converter setup and add PDF pip…
Abdeali099 Mar 15, 2026
3d43a25
feat: implement PDF processor selection and default setting for enhan…
Abdeali099 Mar 16, 2026
7002d84
Merge branch 'version-15' into use-docling-to-extract-data
Abdeali099 Mar 16, 2026
04fedc3
fix: refactor DoclingPDFProcessor to import necessary modules locally
Abdeali099 Mar 17, 2026
c0ebb89
revert: remove unnecessary request cache decorator from get_pdf_proce…
Abdeali099 Mar 17, 2026
51b54c5
chore: add comment to clarify processor resolution order
Abdeali099 Mar 17, 2026
d15a87b
fix: optimize DoclingPDFProcessor to use a singleton converter instance
Abdeali099 Mar 17, 2026
1e9825d
fix: reset file pointer before returning in OCRMyPDFProcessor
Abdeali099 Mar 17, 2026
5b5d7f1
fix: enhance error handling for unsupported spreadsheet file types
Abdeali099 Mar 17, 2026
e9004cd
chore: fix typo
Abdeali099 Mar 17, 2026
505f7f1
fix: improve formatting and clarity in PDFProcessor documentation
Abdeali099 Mar 17, 2026
fef2bb1
fix: add check for existing PDF processor setting before setting default
Abdeali099 Mar 17, 2026
fd8d700
fix: enhance error handling in DoclingPDFProcessor for conversion status
Abdeali099 Mar 17, 2026
426332e
fix: improve CSV content decoding by refining fallback encodings
Abdeali099 Mar 17, 2026
a47f59a
chore: minor fix
Abdeali099 Mar 17, 2026
b5690d1
fix: reset file pointer in trim_pages method for proper PDF processing
Abdeali099 Mar 17, 2026
984bf6f
chore: minor change
Abdeali099 Mar 17, 2026
22b35bc
fix: improve formatting and readability in PDFProcessor documentation
Abdeali099 Mar 17, 2026
9f02aaf
chore: minor change
Abdeali099 Mar 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies = [
"rapidfuzz~=3.12.2",
"pymupdf~=1.26.3",
"openai",
"docling>=2.75.0",
]

[build-system]
Expand Down
5 changes: 5 additions & 0 deletions transaction_parser/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,8 @@
"on_update": "transaction_parser.transaction_parser.overrides.communication.on_update",
}
}

# Registry of available PDF text-extraction backends.
# Keys are the labels offered in the "PDF Processor" Select field of
# Transaction Parser Settings; values are dotted import paths to the
# corresponding processor class resolved at runtime.
pdf_processors = {
    "OCRMyPDF": "transaction_parser.transaction_parser.utils.pdf_processor.OCRMyPDFProcessor",
    "Docling": "transaction_parser.transaction_parser.utils.pdf_processor.DoclingPDFProcessor",
}
3 changes: 2 additions & 1 deletion transaction_parser/patches.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@

[post_model_sync]
# Patches added in this section will be executed after doctypes are migrated
execute:from transaction_parser.install import after_install; after_install() #2
execute:from transaction_parser.install import after_install; after_install() #2
transaction_parser.patches.set_default_pdf_processor #1
Empty file.
13 changes: 13 additions & 0 deletions transaction_parser/patches/set_default_pdf_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import frappe

from transaction_parser.transaction_parser.utils.pdf_processor import (
DEFAULT_PDF_PROCESSOR,
)


def execute():
    """Patch: seed the default PDF processor on Transaction Parser Settings.

    Writes ``DEFAULT_PDF_PROCESSOR`` into the ``pdf_processor`` field only
    when no value is present, so an administrator's explicit choice is
    never overwritten by a migration.
    """
    doctype = "Transaction Parser Settings"
    fieldname = "pdf_processor"

    # frappe.db.get_single_value returns a falsy value when the Single
    # doctype field has never been set — only then do we apply the default.
    current = frappe.db.get_single_value(doctype, fieldname)
    if not current:
        frappe.db.set_single_value(doctype, fieldname, DEFAULT_PDF_PROCESSOR)
1 change: 1 addition & 0 deletions transaction_parser/transaction_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def parse(transaction, country, file_url, ai_model=None, page_limit=None):
ai_model=cstr(ai_model),
page_limit=cint(page_limit),
queue="long",
now=frappe.conf.developer_mode,
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def get_api_key(self) -> str:
_("API Key not found for model {0}").format(self.model.service_provider)
)

def get_content(self, response: dict) -> dict | str:
def get_content(self, response: dict) -> dict:
"""Extract content from API response."""
content = response["choices"][0]["message"]["content"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"enabled",
"ai_model_section",
"default_ai_model",
"pdf_processor",
"api_keys",
"transaction_configurations_section",
"invoice_lookback_count",
Expand Down Expand Up @@ -92,6 +93,15 @@
"mandatory_depends_on": "eval: doc.enabled",
"options": "DeepSeek Chat\nDeepSeek Reasoner\nOpenAI gpt-4o\nOpenAI gpt-4o-mini\nOpenAI gpt-5\nOpenAI gpt-5-mini\nGoogle Gemini Pro\nGoogle Gemini Flash"
},
{
"default": "OCRMyPDF",
"depends_on": "eval: doc.enabled",
"description": "Select the library to use for PDF text extraction",
"fieldname": "pdf_processor",
"fieldtype": "Select",
"label": "PDF Processor",
"options": "OCRMyPDF\nDocling"
},
{
"depends_on": "eval: doc.enabled",
"fieldname": "api_keys",
Expand Down Expand Up @@ -156,7 +166,7 @@
"index_web_pages_for_search": 1,
"issingle": 1,
"links": [],
"modified": "2025-09-08 08:48:58.870032",
"modified": "2026-03-14 13:35:17.150533",
"modified_by": "Administrator",
"module": "Transaction Parser",
"name": "Transaction Parser Settings",
Expand All @@ -177,4 +187,4 @@
"sort_field": "modified",
"sort_order": "DESC",
"states": []
}
}
151 changes: 63 additions & 88 deletions transaction_parser/transaction_parser/utils/file_processor.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,104 @@
import io

import frappe
import ocrmypdf
import pymupdf
from frappe import _
from frappe.core.doctype.file.file import File
from frappe.utils.csvutils import read_csv_content
from frappe.utils.xlsxutils import (
read_xls_file_from_attached_file,
read_xlsx_file_from_attached_file,
)

from transaction_parser.transaction_parser.utils.pdf_processor import (
PDFProcessor,
get_pdf_processor,
)

class FileProcessor:
"""Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content."""

def get_content(self, doc, page_limit=None):
class FileProcessor:
"""
Process files: PDF (trim pages, apply OCR), CSV/Excel (parse data), extract content.
"""

def get_content(
self,
doc: File,
page_limit: int | None = None,
pdf_processor: PDFProcessor | None = None,
) -> str | None:
if doc.file_type == "PDF":
return self._process_pdf(doc, page_limit)
elif doc.file_type in ["CSV", "XLSX", "XLS"]:
return self._process_spreadsheet(doc)
else:
frappe.throw(_("Only PDF, CSV, and Excel files are supported"))
return self.process_pdf(doc, page_limit, pdf_processor)

def _process_pdf(self, doc, page_limit=None):
"""Process PDF files with OCR and page limiting."""
self.file = io.BytesIO(doc.get_content())
self._remove_extra_pages(page_limit)
self._apply_ocr()
return self._get_text()
if doc.file_type in ("CSV", "XLSX", "XLS"):
return self.process_spreadsheet(doc)

def _process_spreadsheet(self, doc):
"""Process CSV and Excel files."""
frappe.throw(
title=_("Unsupported File Type"),
msg=_("Only PDF, CSV, and Excel files are supported"),
)

def process_pdf(
self,
doc: File,
page_limit: int | None = None,
pdf_processor: PDFProcessor | None = None,
) -> str:
"""
Process PDF files using the configured PDF processor strategy.
"""
pdf_processor = pdf_processor or get_pdf_processor()
return pdf_processor.process(doc, page_limit)

def process_spreadsheet(self, doc: File) -> str:
"""
Process CSV and Excel files.
"""
file_content = doc.get_content()

if doc.file_type == "CSV":
file_content_str = self._decode_csv_content(file_content)
file_content_str = self.decode_csv_content(file_content)
rows = read_csv_content(file_content_str)
elif doc.file_type == "XLSX":
rows = read_xlsx_file_from_attached_file(fcontent=file_content)
elif doc.file_type == "XLS":
rows = read_xls_file_from_attached_file(file_content)
else:
frappe.throw(
title=_("Unsupported File Type"),
msg=_(
"Cannot process spreadsheet with file type: {0}. <br> Supported types are CSV, XLSX, and XLS."
).format(doc.file_type),
)

# Convert rows to a formatted string representation
return self._format_rows_as_text(rows)
return self.format_rows_as_text(rows)

def _decode_csv_content(self, content):
"""Decode CSV file content with fallback encodings."""
def decode_csv_content(self, content: str | bytes) -> str:
"""
Decode CSV file content with fallback encodings.
"""
# If content is already a string, return as-is
if isinstance(content, str):
return content

# If content is bytes, decode it
encodings = ["utf-8", "utf-8-sig", "latin1", "cp1252"]

for encoding in encodings:
# ! Note: Always keep `latin1` as the last fallback encoding, as it can decode any byte sequence without errors (Garbage)
for encoding in ("utf-8", "utf-8-sig", "cp1252", "latin1"):
try:
return content.decode(encoding)
except UnicodeDecodeError:
continue

# If all encodings fail, try with error handling
try:
return content.decode("utf-8", errors="replace")
except Exception:
frappe.throw(
_(
"Unable to decode CSV file. Please ensure the file is saved with a supported encoding."
)
frappe.throw(
_(
"Unable to decode CSV file. Please ensure the file is saved with a supported encoding."
)
)

def _format_rows_as_text(self, rows):
"""Convert rows to a text format suitable for AI processing."""
def format_rows_as_text(self, rows: list) -> str:
"""
Convert rows to a text format suitable for AI processing.
"""
if not rows:
frappe.throw(_("No data found in the file."))

Expand Down Expand Up @@ -106,56 +134,3 @@ def _format_rows_as_text(self, rows):
text_parts.append(f"Total columns: {len(rows[0])}")

return "\n".join(text_parts)

def _remove_extra_pages(self, page_limit=None):
if not page_limit:
return

input_pdf = pymupdf.open(stream=self.file, filetype="pdf")
output_pdf = pymupdf.open()
output_pdf.insert_pdf(input_pdf, to_page=page_limit - 1)

temp_file = io.BytesIO()
output_pdf.save(temp_file)

output_pdf.close()
input_pdf.close()

self.file = temp_file
self.file.seek(0)

def _apply_ocr(self):
doc = pymupdf.open(stream=self.file, filetype="pdf")
pages_to_ocr = [
str(i) for i, page in enumerate(doc, 1) if not page.get_text("text").strip()
]

if not pages_to_ocr:
return

pages = ",".join(pages_to_ocr)

temp_file = io.BytesIO()
self.file.seek(0)

ocrmypdf.ocr(
input_file=self.file,
output_file=temp_file,
pages=pages,
progress_bar=False,
rotate_pages=True,
force_ocr=True,
)

self.file = temp_file
self.file.seek(0)

def _get_text(self):
text = ""
doc = pymupdf.open(stream=self.file, filetype="pdf")
for page in doc:
text += page.get_text("text")

doc.close()

return text
Loading
Loading