From ee3acb7e1ac5ae4bd56c7ebcd9b1bb8ce3b5db38 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 18:52:15 +0000 Subject: [PATCH 1/7] =?UTF-8?q?refactor(20-01):=20break=20service=E2=86=92?= =?UTF-8?q?shim=20circular=20imports=20in=20=5Fclassify=5Frow=5Ftype?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace row_classification_facade import in PageValidationService with direct row_classifiers import - Replace row_classification_facade import in ContentDensityService with direct row_classifiers import - Replace row_classification_facade import in RowMergerService with direct row_classifiers import - Use cached _classifier instance per object via hasattr guard --- .../src/bankstatements_core/services/content_density.py | 9 +++++---- .../src/bankstatements_core/services/page_validation.py | 9 +++++---- .../src/bankstatements_core/services/row_merger.py | 9 +++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/services/content_density.py b/packages/parser-core/src/bankstatements_core/services/content_density.py index 3459e90..59ea329 100644 --- a/packages/parser-core/src/bankstatements_core/services/content_density.py +++ b/packages/parser-core/src/bankstatements_core/services/content_density.py @@ -111,9 +111,10 @@ def _classify_row_type( Returns: String classification: 'transaction', etc. """ - # Import here to avoid circular dependency - from bankstatements_core.extraction.row_classification_facade import ( - classify_row_type, + from bankstatements_core.extraction.row_classifiers import ( + create_row_classifier_chain, ) - return classify_row_type(row, columns) + if not hasattr(self, "_classifier"): + self._classifier = create_row_classifier_chain() + return self._classifier.classify(row, columns) diff --git a/packages/parser-core/src/bankstatements_core/services/page_validation.py b/packages/parser-core/src/bankstatements_core/services/page_validation.py index 466727d..5347886 100644 --- a/packages/parser-core/src/bankstatements_core/services/page_validation.py +++ b/packages/parser-core/src/bankstatements_core/services/page_validation.py @@ -189,9 +189,10 @@ def _classify_row_type( Returns: String classification: 'transaction', etc. """ - # Import here to avoid circular dependency - from bankstatements_core.extraction.row_classification_facade import ( - classify_row_type, + from bankstatements_core.extraction.row_classifiers import ( + create_row_classifier_chain, ) - return classify_row_type(row, columns) + if not hasattr(self, "_classifier"): + self._classifier = create_row_classifier_chain() + return self._classifier.classify(row, columns) diff --git a/packages/parser-core/src/bankstatements_core/services/row_merger.py b/packages/parser-core/src/bankstatements_core/services/row_merger.py index f4cfcb8..fa627a5 100644 --- a/packages/parser-core/src/bankstatements_core/services/row_merger.py +++ b/packages/parser-core/src/bankstatements_core/services/row_merger.py @@ -157,12 +157,13 @@ def _classify_row_type( Returns: String classification: 'transaction', 'continuation', etc. """ - # Import here to avoid circular dependency - from bankstatements_core.extraction.row_classification_facade import ( - classify_row_type, + from bankstatements_core.extraction.row_classifiers import ( + create_row_classifier_chain, ) - return classify_row_type(row, columns) + if not hasattr(self, "_classifier"): + self._classifier = create_row_classifier_chain() + return self._classifier.classify(row, columns) def _preserve_balance_from_continuation( self, From 851258aabe19050daae16bed2e9020e1eded6f87 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 18:56:16 +0000 Subject: [PATCH 2/7] refactor(20-02): delete three thin facade pass-throughs; rewire shim to service singletons - Delete content_analysis_facade.py, validation_facade.py, row_classification_facade.py - Rewire pdf_table_extractor.py to import directly from services using module-level singletons - Remove _looks_like_date and calculate_row_completeness_score from __all__ - Fix pdf_extractor.py to import from services (validation, header_detection) directly - Rule 3 auto-fix: pdf_extractor.py used validation_facade which would have broken on deletion --- .../extraction/content_analysis_facade.py | 33 ---- .../extraction/pdf_extractor.py | 34 +++-- .../extraction/row_classification_facade.py | 69 --------- .../extraction/validation_facade.py | 125 --------------- .../pdf_table_extractor.py | 144 +++++++++++++++--- 5 files changed, 140 insertions(+), 265 deletions(-) delete mode 100644 packages/parser-core/src/bankstatements_core/extraction/content_analysis_facade.py delete mode 100644 packages/parser-core/src/bankstatements_core/extraction/row_classification_facade.py delete mode 100644 packages/parser-core/src/bankstatements_core/extraction/validation_facade.py diff --git a/packages/parser-core/src/bankstatements_core/extraction/content_analysis_facade.py b/packages/parser-core/src/bankstatements_core/extraction/content_analysis_facade.py deleted file mode 100644 index 319a98e..0000000 --- a/packages/parser-core/src/bankstatements_core/extraction/content_analysis_facade.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Content analysis facade for PDF extraction. - -This module provides a simplified interface for analyzing content -density in extracted PDFs. Extracted from pdf_table_extractor.py -to improve separation of concerns. -""" - -from __future__ import annotations - -from bankstatements_core.extraction.extraction_params import SLIDING_WINDOW_SIZE - - -def analyze_content_density( - word_groups: dict[float, list[dict]], - columns: dict[str, tuple[int | float, int | float]], - window_size: int = SLIDING_WINDOW_SIZE, -) -> list[tuple[float, float]]: - """Calculate transaction density in sliding windows (backward compat wrapper). - - This function now delegates to ContentDensityService. - - Args: - word_groups: Words grouped by Y-coordinate - columns: Column definitions for row processing - window_size: Number of rows to analyze together - - Returns: - List of tuples (y_coordinate, density_score) - """ - from bankstatements_core.services.content_density import ContentDensityService - - service = ContentDensityService(window_size=window_size) - return service.analyze_content_density(word_groups, columns) diff --git a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py index e3bf110..1900138 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py @@ -144,22 +144,20 @@ def _extract_page(self, page: Any, page_num: int) -> list[dict] | None: page_rows = self._row_builder.build_rows(words) if self.page_validation_enabled: - from bankstatements_core.extraction.validation_facade import ( - validate_page_structure, + from bankstatements_core.services.page_validation import ( + PageValidationService, ) - if not validate_page_structure(page_rows, self.columns): + if not PageValidationService().validate_page_structure(page_rows, self.columns): logger.info( f"Page {page_num}: Invalid table structure detected, " f"skipping {len(page_rows)} rows" ) return None - from bankstatements_core.extraction.validation_facade import ( - merge_continuation_lines, - ) + from bankstatements_core.services.row_merger import RowMergerService - return merge_continuation_lines(page_rows, self.columns) + return RowMergerService().merge_continuation_lines(page_rows, self.columns) def _determine_boundaries_and_extract( self, page: Any, page_num: int @@ -188,8 +186,11 @@ def _determine_boundaries_and_extract( all_words = initial_area.extract_words(use_text_flow=True) if self.header_check_enabled: - from bankstatements_core.extraction.validation_facade import ( - detect_table_headers, + from bankstatements_core.services.header_detection import ( + HeaderDetectionService, + ) + from bankstatements_core.extraction.extraction_params import ( + MIN_HEADER_KEYWORDS, ) header_top = ( @@ -200,7 +201,9 @@ def _determine_boundaries_and_extract( header_area = page.crop((0, header_top, page.width, page.height)) header_words = header_area.extract_words(use_text_flow=True) - if not detect_table_headers(header_words, self.columns): + if not HeaderDetectionService().detect_headers( + header_words, self.columns, min_keywords=MIN_HEADER_KEYWORDS + ): logger.info(f"Page {page_num}: No table headers detected, skipping") return None @@ -231,8 +234,11 @@ def _determine_boundaries_and_extract( words = table_area.extract_words(use_text_flow=True) if self.header_check_enabled: - from bankstatements_core.extraction.validation_facade import ( - detect_table_headers, + from bankstatements_core.services.header_detection import ( + HeaderDetectionService, + ) + from bankstatements_core.extraction.extraction_params import ( + MIN_HEADER_KEYWORDS, ) header_top = ( @@ -243,7 +249,9 @@ def _determine_boundaries_and_extract( header_area = page.crop((0, header_top, page.width, table_bottom_y)) header_words = header_area.extract_words(use_text_flow=True) - if not detect_table_headers(header_words, self.columns): + if not HeaderDetectionService().detect_headers( + header_words, self.columns, min_keywords=MIN_HEADER_KEYWORDS + ): logger.info(f"Page {page_num}: No table headers detected, skipping") return None diff --git a/packages/parser-core/src/bankstatements_core/extraction/row_classification_facade.py b/packages/parser-core/src/bankstatements_core/extraction/row_classification_facade.py deleted file mode 100644 index 2975b45..0000000 --- a/packages/parser-core/src/bankstatements_core/extraction/row_classification_facade.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Row classification facade for PDF extraction. - -This module provides a simplified interface for classifying and analyzing -extracted PDF rows. Extracted from pdf_table_extractor.py to improve -separation of concerns. -""" - -from __future__ import annotations - -from bankstatements_core.extraction.row_classifiers import create_row_classifier_chain - -# Create the row classifier chain once at module level for reuse -_ROW_CLASSIFIER_CHAIN = create_row_classifier_chain() - - -def classify_row_type( - row: dict, columns: dict[str, tuple[int | float, int | float]] -) -> str: - """Classify row type using chain of responsibility pattern. - - Classifies rows as 'transaction', 'administrative', 'reference', - 'continuation', or 'metadata'. Delegates to a Chain of Responsibility - implementation for better maintainability and extensibility. - - Args: - row: Dictionary containing row data - columns: Column definitions for structure analysis - - Returns: - String classification - """ - return _ROW_CLASSIFIER_CHAIN.classify(row, columns) - - -def _looks_like_date(text: str) -> bool: - """Check if text looks like a valid date (backward compatibility wrapper). - - This function now delegates to RowAnalysisService for consistency. - - Args: - text: Text to check - - Returns: - True if text appears to be a date - """ - from bankstatements_core.services.row_analysis import RowAnalysisService - - service = RowAnalysisService() - return service.looks_like_date(text) - - -def calculate_row_completeness_score( - row: dict, columns: dict[str, tuple[int, int]] -) -> float: - """Score row completeness (backward compatibility wrapper). - - This function now delegates to RowAnalysisService. - - Args: - row: Dictionary containing row data - columns: Column definitions for weight calculation - - Returns: - Float score between 0.0 and 1.0 - """ - from bankstatements_core.services.row_analysis import RowAnalysisService - - service = RowAnalysisService() - return service.calculate_row_completeness_score(row, columns) diff --git a/packages/parser-core/src/bankstatements_core/extraction/validation_facade.py b/packages/parser-core/src/bankstatements_core/extraction/validation_facade.py deleted file mode 100644 index 1e9f35b..0000000 --- a/packages/parser-core/src/bankstatements_core/extraction/validation_facade.py +++ /dev/null @@ -1,125 +0,0 @@ -"""Page validation facade for PDF extraction. - -This module provides a simplified interface for validating extracted -page structures. Extracted from pdf_table_extractor.py to improve -separation of concerns. -""" - -from __future__ import annotations - -from bankstatements_core.extraction.extraction_params import ( - MIN_COLUMN_COVERAGE, - MIN_HEADER_KEYWORDS, - MIN_TABLE_ROWS, - MIN_TRANSACTION_RATIO, - REQUIRE_AMOUNT_COLUMN, - REQUIRE_DATE_COLUMN, -) - - -def validate_page_structure( - rows: list[dict], columns: dict[str, tuple[int | float, int | float]] -) -> bool: - """Validate page structure (backward compatibility wrapper). - - This function now delegates to PageValidationService. - - Args: - rows: List of extracted rows from the page - columns: Column definitions to validate against - - Returns: - True if page contains valid table structure - """ - from bankstatements_core.services.page_validation import PageValidationService - - service = PageValidationService( - min_table_rows=MIN_TABLE_ROWS, - min_column_coverage=MIN_COLUMN_COVERAGE, - min_transaction_ratio=MIN_TRANSACTION_RATIO, - require_date_column=REQUIRE_DATE_COLUMN, - require_amount_column=REQUIRE_AMOUNT_COLUMN, - ) - - return service.validate_page_structure(rows, columns) - - -def calculate_column_coverage( - rows: list[dict], columns: dict[str, tuple[int | float, int | float]] -) -> float: - """Calculate column coverage (backward compatibility wrapper). - - This function now delegates to PageValidationService. - - Args: - rows: List of extracted rows - columns: Column definitions - - Returns: - Float between 0.0-1.0 representing column coverage - """ - from bankstatements_core.services.page_validation import PageValidationService - - service = PageValidationService() - return service.calculate_column_coverage(rows, columns) - - -def has_column_type( - columns: dict[str, tuple[int | float, int | float]], - required_types: str | list[str], -) -> bool: - """Check if columns contain required types (backward compatibility wrapper). - - This function now delegates to PageValidationService. - - Args: - columns: Column definitions - required_types: Single type string or list of acceptable types - - Returns: - True if at least one required type is present - """ - from bankstatements_core.services.page_validation import PageValidationService - - service = PageValidationService() - return service.has_column_type(columns, required_types) - - -def detect_table_headers( - words: list[dict], columns: dict[str, tuple[int | float, int | float]] -) -> bool: - """Detect table headers (backward compatibility wrapper). - - This function delegates to HeaderDetectionService. - - Args: - words: List of words from the page - columns: Expected column structure - - Returns: - True if table headers are detected - """ - from bankstatements_core.services.header_detection import HeaderDetectionService - - service = HeaderDetectionService() - return service.detect_headers(words, columns, min_keywords=MIN_HEADER_KEYWORDS) - - -def merge_continuation_lines( - rows: list[dict], columns: dict[str, tuple[int | float, int | float]] -) -> list[dict]: - """Merge continuation lines (backward compatibility wrapper). - - This function now delegates to RowMergerService. - - Args: - rows: List of extracted rows containing transactions and continuation lines - columns: Column definitions for processing - - Returns: - List of rows with continuation lines merged into parent transactions - """ - from bankstatements_core.services.row_merger import RowMergerService - - service = RowMergerService() - return service.merge_continuation_lines(rows, columns) diff --git a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py index 827a46a..10418c8 100644 --- a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py +++ b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py @@ -4,9 +4,7 @@ from the new extraction modules. The implementation has been split into: - extraction_params.py: Constants and thresholds - extraction_facade.py: Main extraction functions -- row_classification_facade.py: Row classification -- content_analysis_facade.py: Content density analysis -- validation_facade.py: Page validation and header detection +- services/: Service classes for row classification, validation, etc. """ from __future__ import annotations @@ -19,9 +17,8 @@ warnings.warn( "bankstatements_core.pdf_table_extractor is a backward-compatibility shim " "and will be removed in a future version. " - "Import directly from bankstatements_core.extraction.extraction_facade, " - "bankstatements_core.extraction.validation_facade, or " - "bankstatements_core.extraction.row_classification_facade instead.", + "Import directly from bankstatements_core.extraction.extraction_facade or " + "bankstatements_core.services instead.", DeprecationWarning, stacklevel=2, ) @@ -36,11 +33,6 @@ parse_columns_from_env, ) -# Re-export content analysis functions (backward compatibility) -from bankstatements_core.extraction.content_analysis_facade import ( # noqa: E402, F401 - analyze_content_density, -) - # Re-export extraction functions (backward compatibility) from bankstatements_core.extraction.extraction_facade import ( # noqa: E402, F401 detect_table_end_boundary_smart, @@ -64,21 +56,125 @@ TABLE_TOP_Y, ) -# Re-export row classification functions (backward compatibility) -from bankstatements_core.extraction.row_classification_facade import ( # noqa: E402, F401 - _looks_like_date, - calculate_row_completeness_score, - classify_row_type, +# Direct service imports — replacing the three thin facade modules +from bankstatements_core.extraction.extraction_params import ( # noqa: E402, F401 + MIN_COLUMN_COVERAGE as _MIN_COLUMN_COVERAGE, + MIN_TABLE_ROWS as _MIN_TABLE_ROWS, + MIN_TRANSACTION_RATIO as _MIN_TRANSACTION_RATIO, + REQUIRE_AMOUNT_COLUMN as _REQUIRE_AMOUNT_COLUMN, + REQUIRE_DATE_COLUMN as _REQUIRE_DATE_COLUMN, + MIN_HEADER_KEYWORDS as _MIN_HEADER_KEYWORDS, + SLIDING_WINDOW_SIZE as _SLIDING_WINDOW_SIZE, +) +from bankstatements_core.extraction.row_classifiers import ( # noqa: E402, F401 + create_row_classifier_chain, +) +from bankstatements_core.services.content_density import ( # noqa: E402, F401 + ContentDensityService, +) +from bankstatements_core.services.header_detection import ( # noqa: E402, F401 + HeaderDetectionService, +) +from bankstatements_core.services.page_validation import ( # noqa: E402, F401 + PageValidationService, +) +from bankstatements_core.services.row_merger import ( # noqa: E402, F401 + RowMergerService, ) -# Re-export validation functions (backward compatibility) -from bankstatements_core.extraction.validation_facade import ( # noqa: E402, F401 - calculate_column_coverage, - detect_table_headers, - has_column_type, - merge_continuation_lines, - validate_page_structure, +# Module-level singletons (instantiated once, not per-call) +_PAGE_VALIDATION_SERVICE = PageValidationService( + min_table_rows=_MIN_TABLE_ROWS, + min_column_coverage=_MIN_COLUMN_COVERAGE, + min_transaction_ratio=_MIN_TRANSACTION_RATIO, + require_date_column=_REQUIRE_DATE_COLUMN, + require_amount_column=_REQUIRE_AMOUNT_COLUMN, ) +_ROW_CLASSIFIER_CHAIN = create_row_classifier_chain() +_HEADER_SERVICE = HeaderDetectionService() +_ROW_MERGER_SERVICE = RowMergerService() +_CONTENT_DENSITY_SERVICE = ContentDensityService() + + +# Wrapper functions (backward compatibility) +def validate_page_structure( + rows: list, columns: dict +) -> bool: + """Validate page structure (backward compatibility wrapper).""" + return _PAGE_VALIDATION_SERVICE.validate_page_structure(rows, columns) + + +def calculate_column_coverage( + rows: list, columns: dict +) -> float: + """Calculate column coverage (backward compatibility wrapper).""" + return _PAGE_VALIDATION_SERVICE.calculate_column_coverage(rows, columns) + + +def has_column_type( + columns: dict, + required_types, +) -> bool: + """Check if columns contain required types (backward compatibility wrapper).""" + return _PAGE_VALIDATION_SERVICE.has_column_type(columns, required_types) + + +def detect_table_headers( + words: list, columns: dict +) -> bool: + """Detect table headers (backward compatibility wrapper).""" + return _HEADER_SERVICE.detect_headers( + words, columns, min_keywords=_MIN_HEADER_KEYWORDS + ) + + +def merge_continuation_lines( + rows: list, columns: dict +) -> list: + """Merge continuation lines (backward compatibility wrapper).""" + return _ROW_MERGER_SERVICE.merge_continuation_lines(rows, columns) + + +def classify_row_type( + row: dict, columns: dict +) -> str: + """Classify row type (backward compatibility wrapper).""" + return _ROW_CLASSIFIER_CHAIN.classify(row, columns) + + +def analyze_content_density( + word_groups: dict, + columns: dict, + window_size: int = _SLIDING_WINDOW_SIZE, +) -> list: + """Calculate transaction density in sliding windows (backward compat wrapper). + + Note: per-call construction retained because window_size varies per caller. + """ + return ContentDensityService(window_size=window_size).analyze_content_density( + word_groups, columns + ) + + +# Implementation-detail helpers kept importable for legacy callers but removed +# from the explicit public list. +def _looks_like_date(text: str) -> bool: + """Check if text looks like a valid date (backward compatibility wrapper).""" + from bankstatements_core.services.row_analysis import RowAnalysisService + + service = RowAnalysisService() + return service.looks_like_date(text) + + +def calculate_row_completeness_score( + row: dict, columns: dict +) -> float: + """Score row completeness (backward compatibility wrapper).""" + from bankstatements_core.services.row_analysis import RowAnalysisService + + service = RowAnalysisService() + return service.calculate_row_completeness_score(row, columns) + # Explicitly list all public exports for backward compatibility __all__ = [ @@ -106,8 +202,6 @@ "detect_table_end_boundary_smart", # Row classification "classify_row_type", - "_looks_like_date", - "calculate_row_completeness_score", # Content analysis "analyze_content_density", # Validation From d29414629615a29a96ab39620f2e72ed55097c08 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 18:57:02 +0000 Subject: [PATCH 3/7] test(20-02): add facade-deletion guard and singleton reuse test - Add test_facade_modules_deleted to test_architecture.py asserting ModuleNotFoundError - Add test_shim_singletons.py verifying module-level singleton identity - Update test_no_production_shim_imports docstring/error message to reflect new import targets --- .../parser-core/tests/test_architecture.py | 24 ++++++++++++---- .../parser-core/tests/test_shim_singletons.py | 28 +++++++++++++++++++ 2 files changed, 46 insertions(+), 6 deletions(-) create mode 100644 packages/parser-core/tests/test_shim_singletons.py diff --git a/packages/parser-core/tests/test_architecture.py b/packages/parser-core/tests/test_architecture.py index 1f86240..3fe47d1 100644 --- a/packages/parser-core/tests/test_architecture.py +++ b/packages/parser-core/tests/test_architecture.py @@ -6,19 +6,20 @@ from __future__ import annotations +import importlib import re from pathlib import Path +import pytest + def test_no_production_shim_imports(): """Production source must not import from the pdf_table_extractor shim. bankstatements_core.pdf_table_extractor is a backward-compatibility shim for external callers only. Internal production code must import directly - from the real facades: - - bankstatements_core.extraction.extraction_facade - - bankstatements_core.extraction.validation_facade - - bankstatements_core.extraction.row_classification_facade + from bankstatements_core.extraction.extraction_facade or + bankstatements_core.services instead. """ src_root = Path(__file__).parent.parent / "src" pattern = re.compile( @@ -39,7 +40,18 @@ def test_no_production_shim_imports(): assert not violations, ( "Production source imports from deprecated shim " "(bankstatements_core.pdf_table_extractor).\n" - "Use bankstatements_core.extraction.extraction_facade, " - "validation_facade, or row_classification_facade instead.\n\n" + "Use bankstatements_core.extraction.extraction_facade or " + "bankstatements_core.services instead.\n\n" "Violations:\n" + "\n".join(violations) ) + + +def test_facade_modules_deleted(): + """Confirm the three thin facade pass-throughs are gone.""" + for module in [ + "bankstatements_core.extraction.content_analysis_facade", + "bankstatements_core.extraction.validation_facade", + "bankstatements_core.extraction.row_classification_facade", + ]: + with pytest.raises(ModuleNotFoundError): + importlib.import_module(module) diff --git a/packages/parser-core/tests/test_shim_singletons.py b/packages/parser-core/tests/test_shim_singletons.py new file mode 100644 index 0000000..e57d5ae --- /dev/null +++ b/packages/parser-core/tests/test_shim_singletons.py @@ -0,0 +1,28 @@ +"""Tests confirming pdf_table_extractor shim uses module-level singletons.""" + +from __future__ import annotations + +import warnings + + +def test_validate_page_structure_reuses_singleton(): + """validate_page_structure calls reuse the same PageValidationService instance.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + import bankstatements_core.pdf_table_extractor as shim + s1 = shim._PAGE_VALIDATION_SERVICE + s2 = shim._PAGE_VALIDATION_SERVICE + assert s1 is s2 + + +def test_all_singletons_are_module_level(): + """All five module-level singletons exist and are stable references.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + import bankstatements_core.pdf_table_extractor as shim + + assert shim._PAGE_VALIDATION_SERVICE is shim._PAGE_VALIDATION_SERVICE + assert shim._ROW_CLASSIFIER_CHAIN is shim._ROW_CLASSIFIER_CHAIN + assert shim._HEADER_SERVICE is shim._HEADER_SERVICE + assert shim._ROW_MERGER_SERVICE is shim._ROW_MERGER_SERVICE + assert shim._CONTENT_DENSITY_SERVICE is shim._CONTENT_DENSITY_SERVICE From 8ccdf25d006df3c40787d9f36cb83b033112d79e Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 20:47:38 +0000 Subject: [PATCH 4/7] style: apply black formatting to RFC #20 modified files --- .../extraction/pdf_extractor.py | 4 +++- .../pdf_table_extractor.py | 24 +++++-------------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py index 1900138..ac54711 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py @@ -148,7 +148,9 @@ def _extract_page(self, page: Any, page_num: int) -> list[dict] | None: PageValidationService, ) - if not PageValidationService().validate_page_structure(page_rows, self.columns): + if not PageValidationService().validate_page_structure( + page_rows, self.columns + ): logger.info( f"Page {page_num}: Invalid table structure detected, " f"skipping {len(page_rows)} rows" diff --git a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py index 10418c8..b2235d8 100644 --- a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py +++ b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py @@ -97,16 +97,12 @@ # Wrapper functions (backward compatibility) -def validate_page_structure( - rows: list, columns: dict -) -> bool: +def validate_page_structure(rows: list, columns: dict) -> bool: """Validate page structure (backward compatibility wrapper).""" return _PAGE_VALIDATION_SERVICE.validate_page_structure(rows, columns) -def calculate_column_coverage( - rows: list, columns: dict -) -> float: +def calculate_column_coverage(rows: list, columns: dict) -> float: """Calculate column coverage (backward compatibility wrapper).""" return _PAGE_VALIDATION_SERVICE.calculate_column_coverage(rows, columns) @@ -119,25 +115,19 @@ def has_column_type( return _PAGE_VALIDATION_SERVICE.has_column_type(columns, required_types) -def detect_table_headers( - words: list, columns: dict -) -> bool: +def detect_table_headers(words: list, columns: dict) -> bool: """Detect table headers (backward compatibility wrapper).""" return _HEADER_SERVICE.detect_headers( words, columns, min_keywords=_MIN_HEADER_KEYWORDS ) -def merge_continuation_lines( - rows: list, columns: dict -) -> list: +def merge_continuation_lines(rows: list, columns: dict) -> list: """Merge continuation lines (backward compatibility wrapper).""" return _ROW_MERGER_SERVICE.merge_continuation_lines(rows, columns) -def classify_row_type( - row: dict, columns: dict -) -> str: +def classify_row_type(row: dict, columns: dict) -> str: """Classify row type (backward compatibility wrapper).""" return _ROW_CLASSIFIER_CHAIN.classify(row, columns) @@ -166,9 +156,7 @@ def _looks_like_date(text: str) -> bool: return service.looks_like_date(text) -def calculate_row_completeness_score( - row: dict, columns: dict -) -> float: +def calculate_row_completeness_score(row: dict, columns: dict) -> float: """Score row completeness (backward compatibility wrapper).""" from bankstatements_core.services.row_analysis import RowAnalysisService From 82e8418a0836aa29d4853048e31a7226bdc1271b Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 20:52:01 +0000 Subject: [PATCH 5/7] style: fix isort import order in RFC #20 modified files --- .../extraction/pdf_extractor.py | 12 ++--- .../pdf_table_extractor.py | 47 ++++++++++++------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py index ac54711..b23d8e2 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py @@ -188,12 +188,12 @@ def _determine_boundaries_and_extract( all_words = initial_area.extract_words(use_text_flow=True) if self.header_check_enabled: - from bankstatements_core.services.header_detection import ( - HeaderDetectionService, - ) from bankstatements_core.extraction.extraction_params import ( MIN_HEADER_KEYWORDS, ) + from bankstatements_core.services.header_detection import ( + HeaderDetectionService, + ) header_top = ( header_check_top_y @@ -236,12 +236,12 @@ def _determine_boundaries_and_extract( words = table_area.extract_words(use_text_flow=True) if self.header_check_enabled: - from bankstatements_core.services.header_detection import ( - HeaderDetectionService, - ) from bankstatements_core.extraction.extraction_params import ( MIN_HEADER_KEYWORDS, ) + from bankstatements_core.services.header_detection import ( + HeaderDetectionService, + ) header_top = ( header_check_top_y diff --git a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py index b2235d8..038c1c8 100644 --- a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py +++ b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py @@ -39,33 +39,50 @@ extract_tables_from_pdf, ) +# Direct service imports — replacing the three thin facade modules # Re-export extraction parameters (backward compatibility) from bankstatements_core.extraction.extraction_params import ( # noqa: E402, F401 ADMINISTRATIVE_PATTERNS, CONTENT_DENSITY_THRESHOLD, ENABLE_PAGE_VALIDATION, - MIN_COLUMN_COVERAGE, - MIN_HEADER_KEYWORDS, - MIN_TABLE_ROWS, - MIN_TRANSACTION_RATIO, - MIN_TRANSACTION_SCORE, - REQUIRE_AMOUNT_COLUMN, - REQUIRE_DATE_COLUMN, - SLIDING_WINDOW_SIZE, - TABLE_BOTTOM_Y, - TABLE_TOP_Y, ) - -# Direct service imports — replacing the three thin facade modules from bankstatements_core.extraction.extraction_params import ( # noqa: E402, F401 + MIN_COLUMN_COVERAGE, +) +from bankstatements_core.extraction.extraction_params import ( MIN_COLUMN_COVERAGE as _MIN_COLUMN_COVERAGE, +) +from bankstatements_core.extraction.extraction_params import MIN_HEADER_KEYWORDS +from bankstatements_core.extraction.extraction_params import ( + MIN_HEADER_KEYWORDS as _MIN_HEADER_KEYWORDS, +) +from bankstatements_core.extraction.extraction_params import MIN_TABLE_ROWS +from bankstatements_core.extraction.extraction_params import ( MIN_TABLE_ROWS as _MIN_TABLE_ROWS, +) +from bankstatements_core.extraction.extraction_params import MIN_TRANSACTION_RATIO +from bankstatements_core.extraction.extraction_params import ( MIN_TRANSACTION_RATIO as _MIN_TRANSACTION_RATIO, +) +from bankstatements_core.extraction.extraction_params import ( # noqa: E402, F401 + MIN_TRANSACTION_SCORE, +) +from bankstatements_core.extraction.extraction_params import REQUIRE_AMOUNT_COLUMN +from bankstatements_core.extraction.extraction_params import ( REQUIRE_AMOUNT_COLUMN as _REQUIRE_AMOUNT_COLUMN, +) +from bankstatements_core.extraction.extraction_params import REQUIRE_DATE_COLUMN +from bankstatements_core.extraction.extraction_params import ( REQUIRE_DATE_COLUMN as _REQUIRE_DATE_COLUMN, - MIN_HEADER_KEYWORDS as _MIN_HEADER_KEYWORDS, +) +from bankstatements_core.extraction.extraction_params import SLIDING_WINDOW_SIZE +from bankstatements_core.extraction.extraction_params import ( SLIDING_WINDOW_SIZE as _SLIDING_WINDOW_SIZE, ) +from bankstatements_core.extraction.extraction_params import ( # noqa: E402, F401 + TABLE_BOTTOM_Y, + TABLE_TOP_Y, +) from bankstatements_core.extraction.row_classifiers import ( # noqa: E402, F401 create_row_classifier_chain, ) @@ -78,9 +95,7 @@ from bankstatements_core.services.page_validation import ( # noqa: E402, F401 PageValidationService, ) -from bankstatements_core.services.row_merger import ( # noqa: E402, F401 - RowMergerService, -) +from bankstatements_core.services.row_merger import RowMergerService # noqa: E402, F401 # Module-level singletons (instantiated once, not per-call) _PAGE_VALIDATION_SERVICE = PageValidationService( From c24d0ffc3a3ff82fcbea6e81e714892e830e7224 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 20:56:56 +0000 Subject: [PATCH 6/7] style: consolidate extraction_params imports to fix isort instability --- .../pdf_table_extractor.py | 54 +++++-------------- 1 file changed, 13 insertions(+), 41 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py index 038c1c8..d31367e 100644 --- a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py +++ b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py @@ -39,47 +39,19 @@ extract_tables_from_pdf, ) -# Direct service imports — replacing the three thin facade modules # Re-export extraction parameters (backward compatibility) from bankstatements_core.extraction.extraction_params import ( # noqa: E402, F401 ADMINISTRATIVE_PATTERNS, CONTENT_DENSITY_THRESHOLD, ENABLE_PAGE_VALIDATION, -) -from bankstatements_core.extraction.extraction_params import ( # noqa: E402, F401 MIN_COLUMN_COVERAGE, -) -from bankstatements_core.extraction.extraction_params import ( - MIN_COLUMN_COVERAGE as _MIN_COLUMN_COVERAGE, -) -from bankstatements_core.extraction.extraction_params import MIN_HEADER_KEYWORDS -from bankstatements_core.extraction.extraction_params import ( - MIN_HEADER_KEYWORDS as _MIN_HEADER_KEYWORDS, -) -from bankstatements_core.extraction.extraction_params import MIN_TABLE_ROWS -from bankstatements_core.extraction.extraction_params import ( - MIN_TABLE_ROWS as _MIN_TABLE_ROWS, -) -from bankstatements_core.extraction.extraction_params import MIN_TRANSACTION_RATIO -from bankstatements_core.extraction.extraction_params import ( - MIN_TRANSACTION_RATIO as _MIN_TRANSACTION_RATIO, -) -from bankstatements_core.extraction.extraction_params import ( # noqa: E402, F401 + MIN_HEADER_KEYWORDS, + MIN_TABLE_ROWS, + MIN_TRANSACTION_RATIO, MIN_TRANSACTION_SCORE, -) -from bankstatements_core.extraction.extraction_params import REQUIRE_AMOUNT_COLUMN -from bankstatements_core.extraction.extraction_params import ( - REQUIRE_AMOUNT_COLUMN as _REQUIRE_AMOUNT_COLUMN, -) -from bankstatements_core.extraction.extraction_params import REQUIRE_DATE_COLUMN -from bankstatements_core.extraction.extraction_params import ( - REQUIRE_DATE_COLUMN as _REQUIRE_DATE_COLUMN, -) -from bankstatements_core.extraction.extraction_params import SLIDING_WINDOW_SIZE -from bankstatements_core.extraction.extraction_params import ( - SLIDING_WINDOW_SIZE as _SLIDING_WINDOW_SIZE, -) -from bankstatements_core.extraction.extraction_params import ( # noqa: E402, F401 + REQUIRE_AMOUNT_COLUMN, + REQUIRE_DATE_COLUMN, + SLIDING_WINDOW_SIZE, TABLE_BOTTOM_Y, TABLE_TOP_Y, ) @@ -99,11 +71,11 @@ # Module-level singletons (instantiated once, not per-call) _PAGE_VALIDATION_SERVICE = PageValidationService( - min_table_rows=_MIN_TABLE_ROWS, - min_column_coverage=_MIN_COLUMN_COVERAGE, - min_transaction_ratio=_MIN_TRANSACTION_RATIO, - require_date_column=_REQUIRE_DATE_COLUMN, - require_amount_column=_REQUIRE_AMOUNT_COLUMN, + min_table_rows=MIN_TABLE_ROWS, + min_column_coverage=MIN_COLUMN_COVERAGE, + min_transaction_ratio=MIN_TRANSACTION_RATIO, + require_date_column=REQUIRE_DATE_COLUMN, + require_amount_column=REQUIRE_AMOUNT_COLUMN, ) _ROW_CLASSIFIER_CHAIN = create_row_classifier_chain() _HEADER_SERVICE = HeaderDetectionService() @@ -133,7 +105,7 @@ def has_column_type( def detect_table_headers(words: list, columns: dict) -> bool: """Detect table headers (backward compatibility wrapper).""" return _HEADER_SERVICE.detect_headers( - words, columns, min_keywords=_MIN_HEADER_KEYWORDS + words, columns, min_keywords=MIN_HEADER_KEYWORDS ) @@ -150,7 +122,7 @@ def classify_row_type(row: dict, columns: dict) -> str: def analyze_content_density( word_groups: dict, columns: dict, - window_size: int = _SLIDING_WINDOW_SIZE, + window_size: int = SLIDING_WINDOW_SIZE, ) -> list: """Calculate transaction density in sliding windows (backward compat wrapper). From caafcb668bc28744728cc8ec8780efa6e2e80991 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 21:00:20 +0000 Subject: [PATCH 7/7] fix: add missing type annotation for has_column_type required_types param --- .../parser-core/src/bankstatements_core/pdf_table_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py index d31367e..3d921a9 100644 --- a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py +++ b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py @@ -96,7 +96,7 @@ def calculate_column_coverage(rows: list, columns: dict) -> float: def has_column_type( columns: dict, - required_types, + required_types: str | list[str], ) -> bool: """Check if columns contain required types (backward compatibility wrapper).""" return _PAGE_VALIDATION_SERVICE.has_column_type(columns, required_types)