Skip to content

Commit f47d341

Browse files
authored
Merge pull request #139 from longieirl/fix/109-pdf-extractor-options
refactor(#109): introduce PDFExtractorOptions to reduce PDFTableExtractor constructor params
2 parents bfd42f7 + 2d24bdb commit f47d341

8 files changed

Lines changed: 192 additions & 102 deletions

File tree

packages/parser-core/src/bankstatements_core/commands/analyze_pdf.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from bankstatements_core.analysis.iban_spatial_filter import IBANSpatialFilter
2323
from bankstatements_core.analysis.table_detector import TableDetector
2424
from bankstatements_core.analysis.template_generator import TemplateGenerator
25+
from bankstatements_core.extraction.extraction_params import PDFExtractorOptions
2526
from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor
2627

2728
logger = logging.getLogger(__name__)
@@ -290,10 +291,14 @@ def _validate_extraction(self, pdf: Any, template_path: Path) -> None:
290291
# This bypasses entitlement system
291292
extractor = PDFTableExtractor(
292293
columns=columns,
293-
table_top_y=table_top_y,
294-
table_bottom_y=table_bottom_y,
295-
header_check_top_y=extraction_config.get("header_check_top_y", 0),
296-
enable_header_check=extraction_config.get("enable_header_check", True),
294+
options=PDFExtractorOptions(
295+
table_top_y=table_top_y,
296+
table_bottom_y=table_bottom_y,
297+
header_check_top_y=extraction_config.get("header_check_top_y", 0),
298+
enable_header_check=extraction_config.get(
299+
"enable_header_check", True
300+
),
301+
),
297302
)
298303

299304
# Extract from first page only for validation

packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,11 @@
1212

1313
from bankstatements_core.config.column_config import DEFAULT_COLUMNS
1414
from bankstatements_core.domain import ExtractionResult
15-
from bankstatements_core.extraction.extraction_params import TABLE_BOTTOM_Y, TABLE_TOP_Y
15+
from bankstatements_core.extraction.extraction_params import (
16+
TABLE_BOTTOM_Y,
17+
TABLE_TOP_Y,
18+
PDFExtractorOptions,
19+
)
1620

1721
if TYPE_CHECKING:
1822
from bankstatements_core.extraction.row_classifiers import RowClassifier
@@ -122,15 +126,17 @@ def extract_tables_from_pdf( # noqa: PLR0913
122126

123127
extractor = PDFTableExtractor(
124128
columns=columns,
125-
table_top_y=table_top_y,
126-
table_bottom_y=table_bottom_y,
127-
enable_dynamic_boundary=enable_dynamic_boundary,
128-
enable_page_validation=enable_page_validation,
129-
enable_header_check=enable_header_check,
130-
header_check_top_y=header_check_top_y,
131-
extraction_config=template.extraction if template is not None else None,
132-
template=template, # NEW: Pass template for document type
133-
entitlements=entitlements,
129+
options=PDFExtractorOptions(
130+
table_top_y=table_top_y,
131+
table_bottom_y=table_bottom_y,
132+
enable_dynamic_boundary=enable_dynamic_boundary,
133+
enable_page_validation=enable_page_validation,
134+
enable_header_check=enable_header_check,
135+
header_check_top_y=header_check_top_y,
136+
extraction_config=template.extraction if template is not None else None,
137+
template=template,
138+
entitlements=entitlements,
139+
),
134140
)
135141

136142
return extractor.extract(pdf_path)

packages/parser-core/src/bankstatements_core/extraction/extraction_params.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,14 @@
77

88
from __future__ import annotations
99

10+
from dataclasses import dataclass, field
11+
from typing import TYPE_CHECKING, Any
12+
1013
from bankstatements_core.config.environment_parser import EnvironmentParser
1114

15+
if TYPE_CHECKING:
16+
from bankstatements_core.templates.template_model import BankTemplate
17+
1218
# ---- Table vertical bounds ----
1319
TABLE_TOP_Y = 300
1420
TABLE_BOTTOM_Y = 720
@@ -48,3 +54,22 @@
4854
"ADMINISTRATIVE_PATTERNS",
4955
["BALANCE FORWARD", "Interest Rate", "Lending @"],
5056
)
57+
58+
59+
@dataclass
60+
class PDFExtractorOptions:
61+
"""Configuration options for PDFTableExtractor.
62+
63+
Groups the optional parameters so the constructor signature stays
64+
within pylint's design limit (R0913/R0917).
65+
"""
66+
67+
table_top_y: int = TABLE_TOP_Y
68+
table_bottom_y: int = TABLE_BOTTOM_Y
69+
enable_dynamic_boundary: bool = False
70+
enable_page_validation: bool = True
71+
enable_header_check: bool = True
72+
header_check_top_y: int | None = None
73+
extraction_config: Any | None = None
74+
template: BankTemplate | None = field(default=None)
75+
entitlements: Any | None = None

packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
CODE_CREDIT_CARD_SKIPPED,
2121
ExtractionWarning,
2222
)
23+
from bankstatements_core.extraction.extraction_params import PDFExtractorOptions
2324
from bankstatements_core.extraction.iban_extractor import IBANExtractor
2425
from bankstatements_core.extraction.page_header_analyser import PageHeaderAnalyser
2526
from bankstatements_core.extraction.row_builder import RowBuilder
@@ -43,30 +44,23 @@ class PDFTableExtractor:
4344
- RowPostProcessor: date propagation and metadata tagging
4445
"""
4546

46-
def __init__( # noqa: PLR0913 # pylint: disable=too-many-arguments,too-many-positional-arguments
47+
def __init__(
4748
self,
4849
columns: dict[str, tuple[int | float, int | float]],
49-
table_top_y: int = 300,
50-
table_bottom_y: int = 720,
51-
enable_dynamic_boundary: bool = False,
52-
enable_page_validation: bool = True,
53-
enable_header_check: bool = True,
54-
header_check_top_y: int | None = None,
50+
options: PDFExtractorOptions | None = None,
5551
pdf_reader: IPDFReader | None = None,
56-
extraction_config: Any | None = None,
57-
template: Any | None = None,
58-
entitlements: Any | None = None,
5952
):
53+
opts = options or PDFExtractorOptions()
6054
self.columns = columns
61-
self.table_top_y = table_top_y
62-
self.table_bottom_y = table_bottom_y
63-
self.enable_dynamic_boundary = enable_dynamic_boundary
64-
self.page_validation_enabled = enable_page_validation
65-
self.header_check_enabled = enable_header_check
66-
self.header_check_top_y = header_check_top_y
67-
self.extraction_config = extraction_config
68-
self.template = template
69-
self._entitlements = entitlements
55+
self.table_top_y = opts.table_top_y
56+
self.table_bottom_y = opts.table_bottom_y
57+
self.enable_dynamic_boundary = opts.enable_dynamic_boundary
58+
self.page_validation_enabled = opts.enable_page_validation
59+
self.header_check_enabled = opts.enable_header_check
60+
self.header_check_top_y = opts.header_check_top_y
61+
self.extraction_config = opts.extraction_config
62+
self.template = opts.template
63+
self._entitlements = opts.entitlements
7064

7165
self._row_classifier = create_row_classifier_chain()
7266
self._row_builder = RowBuilder(columns, self._row_classifier)

packages/parser-core/tests/extraction/test_document_type_enrichment.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import pytest
1010

11+
from bankstatements_core.extraction.extraction_params import PDFExtractorOptions
1112
from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor
1213
from bankstatements_core.extraction.row_post_processor import RowPostProcessor
1314
from bankstatements_core.templates.template_model import (
@@ -116,7 +117,8 @@ class TestDocumentTypeEnrichment:
116117
def test_bank_statement_document_type(self, bank_statement_template, basic_columns):
117118
"""Test that bank statement template adds correct document_type."""
118119
extractor = PDFTableExtractor(
119-
columns=basic_columns, template=bank_statement_template
120+
columns=basic_columns,
121+
options=PDFExtractorOptions(template=bank_statement_template),
120122
)
121123
proc = RowPostProcessor(
122124
columns=basic_columns,
@@ -134,7 +136,8 @@ def test_bank_statement_document_type(self, bank_statement_template, basic_colum
134136
def test_credit_card_document_type(self, credit_card_template, basic_columns):
135137
"""Test that credit card template adds correct document_type."""
136138
extractor = PDFTableExtractor(
137-
columns=basic_columns, template=credit_card_template
139+
columns=basic_columns,
140+
options=PDFExtractorOptions(template=credit_card_template),
138141
)
139142
proc = RowPostProcessor(
140143
columns=basic_columns,
@@ -152,7 +155,8 @@ def test_credit_card_document_type(self, credit_card_template, basic_columns):
152155
def test_loan_statement_document_type(self, loan_statement_template, basic_columns):
153156
"""Test that loan statement template adds correct document_type."""
154157
extractor = PDFTableExtractor(
155-
columns=basic_columns, template=loan_statement_template
158+
columns=basic_columns,
159+
options=PDFExtractorOptions(template=loan_statement_template),
156160
)
157161
proc = RowPostProcessor(
158162
columns=basic_columns,
@@ -169,7 +173,7 @@ def test_loan_statement_document_type(self, loan_statement_template, basic_colum
169173

170174
def test_no_template_defaults_to_bank_statement(self, basic_columns):
171175
"""Test that absence of template defaults to 'bank_statement'."""
172-
extractor = PDFTableExtractor(columns=basic_columns, template=None)
176+
extractor = PDFTableExtractor(columns=basic_columns)
173177
proc = RowPostProcessor(
174178
columns=basic_columns,
175179
row_classifier=extractor._row_classifier,
@@ -187,7 +191,8 @@ def test_document_type_preserved_with_filename(
187191
):
188192
"""Test that document_type is added alongside Filename."""
189193
extractor = PDFTableExtractor(
190-
columns=basic_columns, template=credit_card_template
194+
columns=basic_columns,
195+
options=PDFExtractorOptions(template=credit_card_template),
191196
)
192197
proc = RowPostProcessor(
193198
columns=basic_columns,
@@ -210,7 +215,8 @@ class TestDocumentTypeIntegration:
210215
def test_multiple_rows_same_template(self, credit_card_template, basic_columns):
211216
"""Test that multiple rows from same template have same document_type."""
212217
extractor = PDFTableExtractor(
213-
columns=basic_columns, template=credit_card_template
218+
columns=basic_columns,
219+
options=PDFExtractorOptions(template=credit_card_template),
214220
)
215221
proc = RowPostProcessor(
216222
columns=basic_columns,
@@ -233,7 +239,8 @@ def test_document_type_field_is_string(
233239
):
234240
"""Test that document_type field is always a string."""
235241
extractor = PDFTableExtractor(
236-
columns=basic_columns, template=bank_statement_template
242+
columns=basic_columns,
243+
options=PDFExtractorOptions(template=bank_statement_template),
237244
)
238245
proc = RowPostProcessor(
239246
columns=basic_columns,

packages/parser-core/tests/extraction/test_page_skipping.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from pathlib import Path
1212
from unittest.mock import MagicMock, patch
1313

14+
from bankstatements_core.extraction.extraction_params import PDFExtractorOptions
1415
from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor
1516

1617
# Test columns configuration
@@ -90,7 +91,8 @@ def test_skip_page_without_headers_and_continue(self, mock_pdfplumber):
9091

9192
# Extract with default settings (validation enabled)
9293
extractor = PDFTableExtractor(
93-
columns=TEST_COLUMNS, enable_dynamic_boundary=True
94+
columns=TEST_COLUMNS,
95+
options=PDFExtractorOptions(enable_dynamic_boundary=True),
9496
)
9597
result = extractor.extract(Path("/tmp/test.pdf"))
9698

@@ -133,7 +135,8 @@ def test_skip_all_pages_without_tables(self, mock_pdfplumber):
133135
mock_cropped2.extract_words.return_value = mock_words2
134136

135137
extractor = PDFTableExtractor(
136-
columns=TEST_COLUMNS, enable_dynamic_boundary=True
138+
columns=TEST_COLUMNS,
139+
options=PDFExtractorOptions(enable_dynamic_boundary=True),
137140
)
138141
result = extractor.extract(Path("/tmp/test.pdf"))
139142

@@ -170,7 +173,8 @@ def test_process_all_pages_with_tables(self, mock_pdfplumber):
170173
mock_cropped.extract_words.return_value = mock_words
171174

172175
extractor = PDFTableExtractor(
173-
columns=TEST_COLUMNS, enable_dynamic_boundary=True
176+
columns=TEST_COLUMNS,
177+
options=PDFExtractorOptions(enable_dynamic_boundary=True),
174178
)
175179
result = extractor.extract(Path("/tmp/test.pdf"))
176180

@@ -230,7 +234,8 @@ def test_skip_middle_page_without_table(self, mock_pdfplumber):
230234
mock_cropped3.extract_words.return_value = mock_words3
231235

232236
extractor = PDFTableExtractor(
233-
columns=TEST_COLUMNS, enable_dynamic_boundary=True
237+
columns=TEST_COLUMNS,
238+
options=PDFExtractorOptions(enable_dynamic_boundary=True),
234239
)
235240
result = extractor.extract(Path("/tmp/test.pdf"))
236241

@@ -258,7 +263,8 @@ def test_page_validation_disabled_processes_all_pages(self, mock_pdfplumber):
258263

259264
# With validation disabled
260265
extractor = PDFTableExtractor(
261-
columns=TEST_COLUMNS, enable_page_validation=False
266+
columns=TEST_COLUMNS,
267+
options=PDFExtractorOptions(enable_page_validation=False),
262268
)
263269
result = extractor.extract(Path("/tmp/test.pdf"))
264270

@@ -288,7 +294,8 @@ def test_validation_enabled_by_default(self, mock_pdfplumber):
288294

289295
# Create extractor with defaults (should have validation enabled)
290296
extractor = PDFTableExtractor(
291-
columns=TEST_COLUMNS, enable_dynamic_boundary=True
297+
columns=TEST_COLUMNS,
298+
options=PDFExtractorOptions(enable_dynamic_boundary=True),
292299
)
293300

294301
# Verify both validations are enabled by default
@@ -333,7 +340,10 @@ def test_page_with_insufficient_rows_skipped(self, mock_pdfplumber):
333340
]
334341
mock_cropped.extract_words.return_value = mock_words
335342

336-
extractor = PDFTableExtractor(columns=TEST_COLUMNS, enable_page_validation=True)
343+
extractor = PDFTableExtractor(
344+
columns=TEST_COLUMNS,
345+
options=PDFExtractorOptions(enable_page_validation=True),
346+
)
337347
result = extractor.extract(Path("/tmp/test.pdf"))
338348

339349
assert result.page_count == 1

0 commit comments

Comments
 (0)