Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from bankstatements_core.analysis.iban_spatial_filter import IBANSpatialFilter
from bankstatements_core.analysis.table_detector import TableDetector
from bankstatements_core.analysis.template_generator import TemplateGenerator
from bankstatements_core.extraction.extraction_params import PDFExtractorOptions
from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -290,10 +291,14 @@ def _validate_extraction(self, pdf: Any, template_path: Path) -> None:
# This bypasses entitlement system
extractor = PDFTableExtractor(
columns=columns,
table_top_y=table_top_y,
table_bottom_y=table_bottom_y,
header_check_top_y=extraction_config.get("header_check_top_y", 0),
enable_header_check=extraction_config.get("enable_header_check", True),
options=PDFExtractorOptions(
table_top_y=table_top_y,
table_bottom_y=table_bottom_y,
header_check_top_y=extraction_config.get("header_check_top_y", 0),
enable_header_check=extraction_config.get(
"enable_header_check", True
),
),
)

# Extract from first page only for validation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@

from bankstatements_core.config.column_config import DEFAULT_COLUMNS
from bankstatements_core.domain import ExtractionResult
from bankstatements_core.extraction.extraction_params import TABLE_BOTTOM_Y, TABLE_TOP_Y
from bankstatements_core.extraction.extraction_params import (
TABLE_BOTTOM_Y,
TABLE_TOP_Y,
PDFExtractorOptions,
)

if TYPE_CHECKING:
from bankstatements_core.extraction.row_classifiers import RowClassifier
Expand Down Expand Up @@ -122,15 +126,17 @@ def extract_tables_from_pdf( # noqa: PLR0913

extractor = PDFTableExtractor(
columns=columns,
table_top_y=table_top_y,
table_bottom_y=table_bottom_y,
enable_dynamic_boundary=enable_dynamic_boundary,
enable_page_validation=enable_page_validation,
enable_header_check=enable_header_check,
header_check_top_y=header_check_top_y,
extraction_config=template.extraction if template is not None else None,
template=template, # NEW: Pass template for document type
entitlements=entitlements,
options=PDFExtractorOptions(
table_top_y=table_top_y,
table_bottom_y=table_bottom_y,
enable_dynamic_boundary=enable_dynamic_boundary,
enable_page_validation=enable_page_validation,
enable_header_check=enable_header_check,
header_check_top_y=header_check_top_y,
extraction_config=template.extraction if template is not None else None,
template=template,
entitlements=entitlements,
),
)

return extractor.extract(pdf_path)
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,14 @@

from __future__ import annotations

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any

from bankstatements_core.config.environment_parser import EnvironmentParser

if TYPE_CHECKING:
from bankstatements_core.templates.template_model import BankTemplate

# ---- Table vertical bounds ----
TABLE_TOP_Y = 300
TABLE_BOTTOM_Y = 720
Expand Down Expand Up @@ -48,3 +54,22 @@
"ADMINISTRATIVE_PATTERNS",
["BALANCE FORWARD", "Interest Rate", "Lending @"],
)


@dataclass
class PDFExtractorOptions:
    """Bundle of optional settings accepted by ``PDFTableExtractor``.

    Collecting these values in a single object keeps the extractor's
    constructor signature within pylint's argument-count design limits
    (R0913/R0917). Every field has a default, so ``PDFExtractorOptions()``
    with no arguments is a valid configuration.

    Field order is part of the positional constructor interface generated
    by ``@dataclass``; add new fields at the end.
    """

    # Vertical bounds of the table area; defaults are the module-level
    # TABLE_TOP_Y / TABLE_BOTTOM_Y constants.
    table_top_y: int = TABLE_TOP_Y
    table_bottom_y: int = TABLE_BOTTOM_Y

    # Feature switches: dynamic boundary is off by default, page validation
    # and header check are on by default.
    enable_dynamic_boundary: bool = False
    enable_page_validation: bool = True
    enable_header_check: bool = True

    # Optional Y coordinate for the header check.
    # NOTE(review): how ``None`` is interpreted is up to the consumer - confirm.
    header_check_top_y: int | None = None

    # Opaque extraction configuration (typed ``Any``).
    # Presumably a template's extraction settings - verify against callers.
    extraction_config: Any | None = None

    # ``BankTemplate`` is imported only under TYPE_CHECKING, so this
    # annotation relies on ``from __future__ import annotations`` and is
    # never resolved at runtime.
    template: BankTemplate | None = field(default=None)

    # Opaque entitlements object (typed ``Any``); ``None`` when not supplied.
    entitlements: Any | None = None
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
CODE_CREDIT_CARD_SKIPPED,
ExtractionWarning,
)
from bankstatements_core.extraction.extraction_params import PDFExtractorOptions
from bankstatements_core.extraction.iban_extractor import IBANExtractor
from bankstatements_core.extraction.page_header_analyser import PageHeaderAnalyser
from bankstatements_core.extraction.row_builder import RowBuilder
Expand All @@ -43,30 +44,23 @@ class PDFTableExtractor:
- RowPostProcessor: date propagation and metadata tagging
"""

def __init__( # noqa: PLR0913 # pylint: disable=too-many-arguments,too-many-positional-arguments
def __init__(
self,
columns: dict[str, tuple[int | float, int | float]],
table_top_y: int = 300,
table_bottom_y: int = 720,
enable_dynamic_boundary: bool = False,
enable_page_validation: bool = True,
enable_header_check: bool = True,
header_check_top_y: int | None = None,
options: PDFExtractorOptions | None = None,
pdf_reader: IPDFReader | None = None,
extraction_config: Any | None = None,
template: Any | None = None,
entitlements: Any | None = None,
):
opts = options or PDFExtractorOptions()
self.columns = columns
self.table_top_y = table_top_y
self.table_bottom_y = table_bottom_y
self.enable_dynamic_boundary = enable_dynamic_boundary
self.page_validation_enabled = enable_page_validation
self.header_check_enabled = enable_header_check
self.header_check_top_y = header_check_top_y
self.extraction_config = extraction_config
self.template = template
self._entitlements = entitlements
self.table_top_y = opts.table_top_y
self.table_bottom_y = opts.table_bottom_y
self.enable_dynamic_boundary = opts.enable_dynamic_boundary
self.page_validation_enabled = opts.enable_page_validation
self.header_check_enabled = opts.enable_header_check
self.header_check_top_y = opts.header_check_top_y
self.extraction_config = opts.extraction_config
self.template = opts.template
self._entitlements = opts.entitlements

self._row_classifier = create_row_classifier_chain()
self._row_builder = RowBuilder(columns, self._row_classifier)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import pytest

from bankstatements_core.extraction.extraction_params import PDFExtractorOptions
from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor
from bankstatements_core.extraction.row_post_processor import RowPostProcessor
from bankstatements_core.templates.template_model import (
Expand Down Expand Up @@ -116,7 +117,8 @@ class TestDocumentTypeEnrichment:
def test_bank_statement_document_type(self, bank_statement_template, basic_columns):
"""Test that bank statement template adds correct document_type."""
extractor = PDFTableExtractor(
columns=basic_columns, template=bank_statement_template
columns=basic_columns,
options=PDFExtractorOptions(template=bank_statement_template),
)
proc = RowPostProcessor(
columns=basic_columns,
Expand All @@ -134,7 +136,8 @@ def test_bank_statement_document_type(self, bank_statement_template, basic_colum
def test_credit_card_document_type(self, credit_card_template, basic_columns):
"""Test that credit card template adds correct document_type."""
extractor = PDFTableExtractor(
columns=basic_columns, template=credit_card_template
columns=basic_columns,
options=PDFExtractorOptions(template=credit_card_template),
)
proc = RowPostProcessor(
columns=basic_columns,
Expand All @@ -152,7 +155,8 @@ def test_credit_card_document_type(self, credit_card_template, basic_columns):
def test_loan_statement_document_type(self, loan_statement_template, basic_columns):
"""Test that loan statement template adds correct document_type."""
extractor = PDFTableExtractor(
columns=basic_columns, template=loan_statement_template
columns=basic_columns,
options=PDFExtractorOptions(template=loan_statement_template),
)
proc = RowPostProcessor(
columns=basic_columns,
Expand All @@ -169,7 +173,7 @@ def test_loan_statement_document_type(self, loan_statement_template, basic_colum

def test_no_template_defaults_to_bank_statement(self, basic_columns):
"""Test that absence of template defaults to 'bank_statement'."""
extractor = PDFTableExtractor(columns=basic_columns, template=None)
extractor = PDFTableExtractor(columns=basic_columns)
proc = RowPostProcessor(
columns=basic_columns,
row_classifier=extractor._row_classifier,
Expand All @@ -187,7 +191,8 @@ def test_document_type_preserved_with_filename(
):
"""Test that document_type is added alongside Filename."""
extractor = PDFTableExtractor(
columns=basic_columns, template=credit_card_template
columns=basic_columns,
options=PDFExtractorOptions(template=credit_card_template),
)
proc = RowPostProcessor(
columns=basic_columns,
Expand All @@ -210,7 +215,8 @@ class TestDocumentTypeIntegration:
def test_multiple_rows_same_template(self, credit_card_template, basic_columns):
"""Test that multiple rows from same template have same document_type."""
extractor = PDFTableExtractor(
columns=basic_columns, template=credit_card_template
columns=basic_columns,
options=PDFExtractorOptions(template=credit_card_template),
)
proc = RowPostProcessor(
columns=basic_columns,
Expand All @@ -233,7 +239,8 @@ def test_document_type_field_is_string(
):
"""Test that document_type field is always a string."""
extractor = PDFTableExtractor(
columns=basic_columns, template=bank_statement_template
columns=basic_columns,
options=PDFExtractorOptions(template=bank_statement_template),
)
proc = RowPostProcessor(
columns=basic_columns,
Expand Down
24 changes: 17 additions & 7 deletions packages/parser-core/tests/extraction/test_page_skipping.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pathlib import Path
from unittest.mock import MagicMock, patch

from bankstatements_core.extraction.extraction_params import PDFExtractorOptions
from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor

# Test columns configuration
Expand Down Expand Up @@ -90,7 +91,8 @@ def test_skip_page_without_headers_and_continue(self, mock_pdfplumber):

# Extract with default settings (validation enabled)
extractor = PDFTableExtractor(
columns=TEST_COLUMNS, enable_dynamic_boundary=True
columns=TEST_COLUMNS,
options=PDFExtractorOptions(enable_dynamic_boundary=True),
)
result = extractor.extract(Path("/tmp/test.pdf"))

Expand Down Expand Up @@ -133,7 +135,8 @@ def test_skip_all_pages_without_tables(self, mock_pdfplumber):
mock_cropped2.extract_words.return_value = mock_words2

extractor = PDFTableExtractor(
columns=TEST_COLUMNS, enable_dynamic_boundary=True
columns=TEST_COLUMNS,
options=PDFExtractorOptions(enable_dynamic_boundary=True),
)
result = extractor.extract(Path("/tmp/test.pdf"))

Expand Down Expand Up @@ -170,7 +173,8 @@ def test_process_all_pages_with_tables(self, mock_pdfplumber):
mock_cropped.extract_words.return_value = mock_words

extractor = PDFTableExtractor(
columns=TEST_COLUMNS, enable_dynamic_boundary=True
columns=TEST_COLUMNS,
options=PDFExtractorOptions(enable_dynamic_boundary=True),
)
result = extractor.extract(Path("/tmp/test.pdf"))

Expand Down Expand Up @@ -230,7 +234,8 @@ def test_skip_middle_page_without_table(self, mock_pdfplumber):
mock_cropped3.extract_words.return_value = mock_words3

extractor = PDFTableExtractor(
columns=TEST_COLUMNS, enable_dynamic_boundary=True
columns=TEST_COLUMNS,
options=PDFExtractorOptions(enable_dynamic_boundary=True),
)
result = extractor.extract(Path("/tmp/test.pdf"))

Expand Down Expand Up @@ -258,7 +263,8 @@ def test_page_validation_disabled_processes_all_pages(self, mock_pdfplumber):

# With validation disabled
extractor = PDFTableExtractor(
columns=TEST_COLUMNS, enable_page_validation=False
columns=TEST_COLUMNS,
options=PDFExtractorOptions(enable_page_validation=False),
)
result = extractor.extract(Path("/tmp/test.pdf"))

Expand Down Expand Up @@ -288,7 +294,8 @@ def test_validation_enabled_by_default(self, mock_pdfplumber):

# Create extractor with defaults (should have validation enabled)
extractor = PDFTableExtractor(
columns=TEST_COLUMNS, enable_dynamic_boundary=True
columns=TEST_COLUMNS,
options=PDFExtractorOptions(enable_dynamic_boundary=True),
)

# Verify both validations are enabled by default
Expand Down Expand Up @@ -333,7 +340,10 @@ def test_page_with_insufficient_rows_skipped(self, mock_pdfplumber):
]
mock_cropped.extract_words.return_value = mock_words

extractor = PDFTableExtractor(columns=TEST_COLUMNS, enable_page_validation=True)
extractor = PDFTableExtractor(
columns=TEST_COLUMNS,
options=PDFExtractorOptions(enable_page_validation=True),
)
result = extractor.extract(Path("/tmp/test.pdf"))

assert result.page_count == 1
Expand Down
Loading