From 3fdb018f6e4fb85414162f2bb6b7216fbda5b7a0 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 15:51:08 +0000 Subject: [PATCH 1/8] chore: ignore .planning/ directory --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 167bd99..1fdd1ba 100644 --- a/.gitignore +++ b/.gitignore @@ -246,3 +246,6 @@ tmp/ *.tgz *.rar *.7z + +# GSD planning artifacts +.planning/ From 0bc54edee4b2916ebe49fa836fd27131122825f6 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 15:33:02 +0000 Subject: [PATCH 2/8] feat(19-02): redirect production shim imports to real facades - extraction_orchestrator.py: import extract_tables_from_pdf from extraction_facade (not shim) - pdf_extractor.py: redirect all 5 inline shim imports to validation_facade / extraction_facade - pdf_table_extractor.py: annotate with DeprecationWarning at module import time - pyproject.toml: suppress DeprecationWarning in filterwarnings for legitimate shim test files --- packages/parser-core/pyproject.toml | 1 + .../bankstatements_core/extraction/pdf_extractor.py | 10 +++++----- .../src/bankstatements_core/pdf_table_extractor.py | 11 +++++++++++ .../services/extraction_orchestrator.py | 2 +- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/packages/parser-core/pyproject.toml b/packages/parser-core/pyproject.toml index 91eaec6..1263655 100644 --- a/packages/parser-core/pyproject.toml +++ b/packages/parser-core/pyproject.toml @@ -93,6 +93,7 @@ markers = [ ] filterwarnings = [ "ignore:TestResult has no addDuration method:RuntimeWarning", + "ignore::DeprecationWarning:bankstatements_core.pdf_table_extractor", ] [tool.coverage.run] diff --git a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py index a6ff58c..fffbc40 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py @@ -144,7 +144,7 @@ def _extract_page(self, page: Any, page_num: int) -> list[dict] | None: page_rows = self._row_builder.build_rows(words) if self.page_validation_enabled: - from bankstatements_core.pdf_table_extractor import validate_page_structure + from bankstatements_core.extraction.validation_facade import validate_page_structure if not validate_page_structure(page_rows, self.columns): logger.info( @@ -153,7 +153,7 @@ def _extract_page(self, page: Any, page_num: int) -> list[dict] | None: ) return None - from bankstatements_core.pdf_table_extractor import merge_continuation_lines + from bankstatements_core.extraction.validation_facade import merge_continuation_lines return merge_continuation_lines(page_rows, self.columns) @@ -184,7 +184,7 @@ def _determine_boundaries_and_extract( all_words = initial_area.extract_words(use_text_flow=True) if self.header_check_enabled: - from bankstatements_core.pdf_table_extractor import detect_table_headers + from bankstatements_core.extraction.validation_facade import detect_table_headers header_top = ( header_check_top_y @@ -198,7 +198,7 @@ def _determine_boundaries_and_extract( logger.info(f"Page {page_num}: No table headers detected, skipping") return None - from bankstatements_core.pdf_table_extractor import ( + from bankstatements_core.extraction.extraction_facade import ( detect_table_end_boundary_smart, ) @@ -225,7 +225,7 @@ def _determine_boundaries_and_extract( words = table_area.extract_words(use_text_flow=True) if self.header_check_enabled: - from bankstatements_core.pdf_table_extractor import detect_table_headers + from bankstatements_core.extraction.validation_facade import detect_table_headers header_top = ( header_check_top_y diff --git a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py index e3f4b11..3c7f06e 100644 --- a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py +++ b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py @@ -12,6 +12,17 @@ from __future__ import annotations import logging +import warnings + +warnings.warn( + "bankstatements_core.pdf_table_extractor is a backward-compatibility shim " + "and will be removed in a future version. " + "Import directly from bankstatements_core.extraction.extraction_facade, " + "bankstatements_core.extraction.validation_facade, or " + "bankstatements_core.extraction.row_classification_facade instead.", + DeprecationWarning, + stacklevel=2, +) import pdfplumber # noqa: F401 - used by extraction module diff --git a/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py index 529b65f..2e1efa2 100644 --- a/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py @@ -13,7 +13,7 @@ from bankstatements_core.config.processor_config import ExtractionConfig from bankstatements_core.entitlements import Entitlements -from bankstatements_core.pdf_table_extractor import extract_tables_from_pdf +from bankstatements_core.extraction.extraction_facade import extract_tables_from_pdf from bankstatements_core.templates import TemplateDetector, TemplateRegistry from bankstatements_core.templates.template_model import BankTemplate From 0baab0eb7cc09584abc428733e7d1232615f4fc0 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 15:34:45 +0000 Subject: [PATCH 3/8] feat(19-02): add architecture guard and redirect remaining shim imports - tests/test_architecture.py: new CI guard scanning src/ for shim imports; fails with descriptive violation message if any found - facades/processing_facade.py: get_columns_config redirected to config.column_config - services/content_density.py: classify_row_type redirected to row_classification_facade - services/page_validation.py: classify_row_type redirected to row_classification_facade - services/row_merger.py: classify_row_type redirected to row_classification_facade - Full test suite: 1302 passed, 92.36% coverage --- .../facades/processing_facade.py | 2 +- .../services/content_density.py | 2 +- .../services/page_validation.py | 2 +- .../services/row_merger.py | 2 +- .../parser-core/tests/test_architecture.py | 45 +++++++++++++++++++ 5 files changed, 49 insertions(+), 4 deletions(-) create mode 100644 packages/parser-core/tests/test_architecture.py diff --git a/packages/parser-core/src/bankstatements_core/facades/processing_facade.py b/packages/parser-core/src/bankstatements_core/facades/processing_facade.py index 27b9e17..c98ac33 100644 --- a/packages/parser-core/src/bankstatements_core/facades/processing_facade.py +++ b/packages/parser-core/src/bankstatements_core/facades/processing_facade.py @@ -12,7 +12,7 @@ from bankstatements_core.config.app_config import AppConfig, ConfigurationError from bankstatements_core.entitlements import EntitlementError, Entitlements -from bankstatements_core.pdf_table_extractor import get_columns_config +from bankstatements_core.config.column_config import get_columns_config if TYPE_CHECKING: from bankstatements_core.processor import BankStatementProcessor diff --git a/packages/parser-core/src/bankstatements_core/services/content_density.py b/packages/parser-core/src/bankstatements_core/services/content_density.py index 57bb49f..82fde5e 100644 --- a/packages/parser-core/src/bankstatements_core/services/content_density.py +++ b/packages/parser-core/src/bankstatements_core/services/content_density.py @@ -112,6 +112,6 @@ def _classify_row_type( String classification: 'transaction', etc. """ # Import here to avoid circular dependency - from bankstatements_core.pdf_table_extractor import classify_row_type + from bankstatements_core.extraction.row_classification_facade import classify_row_type return classify_row_type(row, columns) diff --git a/packages/parser-core/src/bankstatements_core/services/page_validation.py b/packages/parser-core/src/bankstatements_core/services/page_validation.py index 50319ad..98ec543 100644 --- a/packages/parser-core/src/bankstatements_core/services/page_validation.py +++ b/packages/parser-core/src/bankstatements_core/services/page_validation.py @@ -190,6 +190,6 @@ def _classify_row_type( String classification: 'transaction', etc. """ # Import here to avoid circular dependency - from bankstatements_core.pdf_table_extractor import classify_row_type + from bankstatements_core.extraction.row_classification_facade import classify_row_type return classify_row_type(row, columns) diff --git a/packages/parser-core/src/bankstatements_core/services/row_merger.py b/packages/parser-core/src/bankstatements_core/services/row_merger.py index ea3a9c9..d706bb6 100644 --- a/packages/parser-core/src/bankstatements_core/services/row_merger.py +++ b/packages/parser-core/src/bankstatements_core/services/row_merger.py @@ -158,7 +158,7 @@ def _classify_row_type( String classification: 'transaction', 'continuation', etc. """ # Import here to avoid circular dependency - from bankstatements_core.pdf_table_extractor import classify_row_type + from bankstatements_core.extraction.row_classification_facade import classify_row_type return classify_row_type(row, columns) diff --git a/packages/parser-core/tests/test_architecture.py b/packages/parser-core/tests/test_architecture.py new file mode 100644 index 0000000..1f86240 --- /dev/null +++ b/packages/parser-core/tests/test_architecture.py @@ -0,0 +1,45 @@ +"""Architecture enforcement tests. + +These tests enforce structural constraints on the codebase that cannot be +expressed purely through type-checking or linting. +""" + +from __future__ import annotations + +import re +from pathlib import Path + + +def test_no_production_shim_imports(): + """Production source must not import from the pdf_table_extractor shim. + + bankstatements_core.pdf_table_extractor is a backward-compatibility shim + for external callers only. Internal production code must import directly + from the real facades: + - bankstatements_core.extraction.extraction_facade + - bankstatements_core.extraction.validation_facade + - bankstatements_core.extraction.row_classification_facade + """ + src_root = Path(__file__).parent.parent / "src" + pattern = re.compile( + r"from\s+bankstatements_core\.pdf_table_extractor\s+import" + r"|import\s+bankstatements_core\.pdf_table_extractor" + ) + violations = [] + for py_file in src_root.rglob("*.py"): + # Skip the shim itself — it may reference its own module name in docstrings + if py_file.name == "pdf_table_extractor.py": + continue + text = py_file.read_text(encoding="utf-8") + for i, line in enumerate(text.splitlines(), 1): + if pattern.search(line): + violations.append( + f"{py_file.relative_to(src_root)}:{i}: {line.strip()}" + ) + assert not violations, ( + "Production source imports from deprecated shim " + "(bankstatements_core.pdf_table_extractor).\n" + "Use bankstatements_core.extraction.extraction_facade, " + "validation_facade, or row_classification_facade instead.\n\n" + "Violations:\n" + "\n".join(violations) + ) From 4639eb7f2e0815628e0a25aeea2339a901ebe81a Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 15:35:03 +0000 Subject: [PATCH 4/8] refactor(19-01): delete 10 dead private methods from BankStatementProcessor - Remove _write_csv_with_totals and _append_totals_to_csv (zero callers) - Remove _filter_rows, _has_row_data, _filter_empty_rows (owned by TransactionFilterService) - Remove _is_header_row, _filter_header_rows (owned by TransactionFilterService) - Remove _has_valid_transaction_date, _filter_invalid_date_rows (owned by TransactionFilterService) - Remove _group_rows_by_iban (owned by TransactionProcessingOrchestrator) - Drop unused Callable import - Delete test_iban_grouping.py, test_empty_row_filtering.py, test_header_row_filtering.py - Remove test_append_totals_uses_repository and appended_csv tracking from test_repository_integration.py --- .../src/bankstatements_core/processor.py | 295 +---------------- .../tests/test_empty_row_filtering.py | 192 ----------- .../tests/test_header_row_filtering.py | 309 ------------------ .../parser-core/tests/test_iban_grouping.py | 199 ----------- .../tests/test_repository_integration.py | 30 +- 5 files changed, 3 insertions(+), 1022 deletions(-) delete mode 100644 packages/parser-core/tests/test_empty_row_filtering.py delete mode 100644 packages/parser-core/tests/test_header_row_filtering.py delete mode 100644 packages/parser-core/tests/test_iban_grouping.py diff --git a/packages/parser-core/src/bankstatements_core/processor.py b/packages/parser-core/src/bankstatements_core/processor.py index b6c8748..95da657 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -5,7 +5,7 @@ from collections import defaultdict # noqa: F401 - imported for test mocking from datetime import datetime from pathlib import Path -from typing import Any, Callable +from typing import Any import pandas as pd @@ -277,76 +277,6 @@ def _detect_duplicates(self, all_rows: list[dict]) -> tuple[list[dict], list[dic """ return self._duplicate_service.detect_and_separate(all_rows) - def _write_csv_with_totals(self, df: pd.DataFrame, csv_path: Path) -> None: - """ - Write DataFrame to CSV with optional totals rows appended. - - Args: - df: DataFrame containing transaction data - csv_path: Path to write the CSV file - """ - # Write the main transaction data first - df.to_csv(csv_path, index=False) - - # If totals are configured, append them to the CSV - if self.totals_columns: - # Find columns that match the configured patterns - matching_columns = find_matching_columns( - list(df.columns), self.totals_columns - ) - - if matching_columns: - logger.info( - "Calculating totals for columns: %s", ", ".join(matching_columns) - ) - - # Calculate totals for matching columns - totals = calculate_column_totals(df, matching_columns) - - # Create totals rows - self._append_totals_to_csv(csv_path, list(df.columns), totals) - - logger.info("Totals appended to CSV: %s", csv_path) - else: - logger.warning( - "No columns found matching totals patterns: %s", - ", ".join(self.totals_columns), - ) - - def _append_totals_to_csv( - self, csv_path: Path, all_columns: list[str], totals: dict[str, float] - ) -> None: - """ - Append totals rows to the existing CSV file using repository. - - Args: - csv_path: Path to the CSV file - all_columns: All column names in the CSV - totals: Dictionary mapping column names to their totals - """ - # Build totals content - content_parts = ["\n"] # Empty line for separation - - # Create totals row - totals_row = [] - for col_name in all_columns: - if col_name in totals: - # Format the total value (2 decimal places for currency) - totals_row.append(f"{totals[col_name]:.2f}") - elif is_date_column(col_name): - # Add "TOTAL" label in the first column (usually Date) - totals_row.append("TOTAL") - else: - # Empty value for non-total columns - totals_row.append("") - - # Build content string - content_parts.append(",".join(f'"{value}"' for value in totals_row)) - content_parts.append("\n") - - # Use repository to append - self.repository.append_to_csv(csv_path, "".join(content_parts)) - def _process_all_pdfs(self) -> tuple[list[dict], int, dict[str, str]]: """Process all PDF files in the input directory and extract transaction data. @@ -358,188 +288,6 @@ def _process_all_pdfs(self) -> tuple[list[dict], int, dict[str, str]]: self.input_dir, recursive=self.recursive_scan ) - def _filter_rows( - self, - rows: list[dict], - predicate: Callable[[dict], bool], - filter_name: str = "rows", - ) -> list[dict]: - """ - Generic row filter that applies a predicate function to each row. - - This method eliminates code duplication across various filtering operations - by providing a unified filtering pattern. - - Args: - rows: List of transaction dictionaries to filter - predicate: Function that returns True if row should be kept, False if filtered - filter_name: Human-readable name for logging (e.g., "empty rows", "header rows") - - Returns: - List of transactions that passed the predicate - """ - filtered_rows = [] - filtered_count = 0 - - for row in rows: - if predicate(row): - filtered_rows.append(row) - else: - filtered_count += 1 - - if filtered_count > 0: - logger.debug(f"Filtered out {filtered_count} {filter_name}") - - return filtered_rows - - def _has_row_data(self, row: dict) -> bool: - """ - Check if a row has any non-empty data (excluding Filename). - - Args: - row: Transaction dictionary to check - - Returns: - True if row has data, False if empty - """ - for key, value in row.items(): - if key == "Filename": - continue - if value and str(value).strip(): - return True - return False - - def _filter_empty_rows(self, rows: list[dict]) -> list[dict]: - """ - Filter out empty rows that have no meaningful data. - - A row is considered empty if all its values (excluding 'Filename') are empty, - whitespace, or None. - - Args: - rows: List of transaction dictionaries - - Returns: - List of non-empty transactions - """ - return self._filter_rows(rows, self._has_row_data, "empty rows") - - def _is_header_row(self, row: dict) -> bool: - """ - Detect if a row is actually a table header row. - - A row is considered a header if its values match the column names. - For example, if the "Debit €" column contains the value "Debit €", - that indicates it's a header row, not transaction data. - - Args: - row: Transaction dictionary to check - - Returns: - True if the row is a header row, False otherwise - """ - # Count how many fields have values that match their column names - matches = 0 - checked_fields = 0 - - for column_name, value in row.items(): - # Skip the Filename field - if column_name == "Filename": - continue - - # Skip empty values - if not value or not str(value).strip(): - continue - - checked_fields += 1 - value_str = str(value).strip() - column_str = column_name.strip() - - # Check for exact match or partial match (case-insensitive) - if ( - value_str.lower() == column_str.lower() - or value_str.lower() in column_str.lower() - or column_str.lower() in value_str.lower() - ): - matches += 1 - - # If we checked at least 2 fields and more than 50% match, it's a header - if checked_fields >= 2 and matches / checked_fields > 0.5: - logger.debug( - "Detected header row: %d/%d fields matched", matches, checked_fields - ) - return True - - return False - - def _filter_header_rows(self, rows: list[dict]) -> list[dict]: - """ - Filter out header rows that were incorrectly extracted as transactions. - - Args: - rows: List of transaction dictionaries - - Returns: - List of transactions with header rows removed - """ - return self._filter_rows( - rows, lambda row: not self._is_header_row(row), "header rows" - ) - - def _has_valid_transaction_date(self, row: dict) -> bool: - """ - Check if a row has a valid transaction date. - - A valid transaction date matches patterns like: - - DD/MM/YY or DD/MM/YYYY - - DD-MM-YY or DD-MM-YYYY - - DD MMM YYYY (e.g., "12 Jan 2025") - - DD MMMM YYYY (e.g., "12 January 2025") - - Args: - row: Transaction dictionary to check - - Returns: - True if the row has a valid date, False otherwise - """ - import re - - date_value = row.get("Date", "") - if not date_value or not str(date_value).strip(): - return False - - date_str = str(date_value).strip() - - # Pattern for common transaction date formats - patterns = [ - r"^\d{1,2}/\d{1,2}/\d{2,4}$", # DD/MM/YY or DD/MM/YYYY - r"^\d{1,2}-\d{1,2}-\d{2,4}$", # DD-MM-YY or DD-MM-YYYY - r"^\d{1,2}/\d{1,2}$", # DD/MM (partial date, no year) - r"^\d{1,2}-\d{1,2}$", # DD-MM (partial date, no year) - r"^\d{1,2}\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)$", # DD MMM (partial, no year) - r"^\d{1,2}\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\s+\d{4}$", # DD MMM YYYY - r"^\d{1,2}\s+(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}$", # DD MMMM YYYY - ] - - return any(re.match(pattern, date_str, re.IGNORECASE) for pattern in patterns) - - def _filter_invalid_date_rows(self, rows: list[dict]) -> list[dict]: - """ - Filter out rows with invalid or missing transaction dates. - - This removes junk rows like account summaries, headers, and footer text - that don't have valid transaction dates. - - Args: - rows: List of transaction dictionaries - - Returns: - List of transactions with valid dates - """ - return self._filter_rows( - rows, self._has_valid_transaction_date, "rows with invalid dates" - ) - def _sort_transactions_by_date(self, rows: list[dict]) -> list[dict]: """ Sort transactions using the configured sorting strategy. @@ -552,47 +300,6 @@ def _sort_transactions_by_date(self, rows: list[dict]) -> list[dict]: """ return self._sorting_service.sort(rows) - def _group_rows_by_iban( - self, all_rows: list[dict], pdf_ibans: dict[str, str] - ) -> dict[str, list[dict]]: - """ - Group transaction rows by IBAN (last 4 digits). - - Args: - all_rows: All transaction rows from all PDFs - pdf_ibans: Dictionary mapping PDF filenames to their IBANs - - Returns: - Dictionary mapping IBAN suffix (last 4 digits) to list of rows - """ - rows_by_iban: dict[str, list[dict]] = {} - - # Create reverse mapping: filename -> iban last 4 digits - filename_to_suffix: dict[str, str] = {} - for pdf_filename, iban in pdf_ibans.items(): - suffix = iban[-4:] if iban else "unknown" - filename_to_suffix[pdf_filename] = suffix - - # Group rows by IBAN suffix - for row in all_rows: - # Get the PDF filename from the row - pdf_filename = row.get("Filename", "") - - # Look up the IBAN suffix for this PDF - iban_suffix = filename_to_suffix.get(pdf_filename, "unknown") - - # Add row to the appropriate group - if iban_suffix not in rows_by_iban: - rows_by_iban[iban_suffix] = [] - rows_by_iban[iban_suffix].append(row) - - logger.info( - f"Grouped transactions into {len(rows_by_iban)} IBAN groups: " - f"{', '.join(sorted(rows_by_iban.keys()))}" - ) - - return rows_by_iban - def run(self) -> dict: """Process all bank statement PDFs and generate output files. diff --git a/packages/parser-core/tests/test_empty_row_filtering.py b/packages/parser-core/tests/test_empty_row_filtering.py deleted file mode 100644 index 33d8b59..0000000 --- a/packages/parser-core/tests/test_empty_row_filtering.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Tests for empty row filtering in processor.""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from bankstatements_core.config.processor_config import ( - ExtractionConfig, - ProcessorConfig, -) -from bankstatements_core.processor import BankStatementProcessor - - -def create_test_processor(**kwargs): - """Helper to create processor with test configuration.""" - input_dir = kwargs.pop("input_dir", Path("input")) - output_dir = kwargs.pop("output_dir", Path("output")) - columns = kwargs.pop("columns", None) - - extraction_config = ExtractionConfig(columns=columns) - config = ProcessorConfig( - input_dir=input_dir, - output_dir=output_dir, - extraction=extraction_config, - ) - return BankStatementProcessor(config=config) - - -class TestEmptyRowFiltering: - """Test that empty rows are never written to output files.""" - - def test_filter_empty_rows_removes_all_empty(self): - """Test that completely empty rows are removed.""" - processor = create_test_processor( - columns={"Date": (0, 100), "Details": (100, 200)}, - ) - - rows = [ - {"Date": "", "Details": "", "Filename": "test.pdf"}, - {"Date": " ", "Details": " ", "Filename": "test.pdf"}, - {"Date": None, "Details": None, "Filename": "test.pdf"}, - {"Date": "01 Jan", "Details": "Transaction", "Filename": "test.pdf"}, - ] - - filtered = processor._filter_empty_rows(rows) - - assert len(filtered) == 1 - assert filtered[0]["Date"] == "01 Jan" - - def test_filter_empty_rows_keeps_rows_with_data(self): - """Test that rows with any data are kept.""" - processor = create_test_processor( - columns={"Date": (0, 100), "Details": (100, 200), "Amount": (200, 300)}, - ) - - rows = [ - {"Date": "01 Jan", "Details": "", "Amount": "", "Filename": "test.pdf"}, - { - "Date": "", - "Details": "Transaction", - "Amount": "", - "Filename": "test.pdf", - }, - {"Date": "", "Details": "", "Amount": "50.00", "Filename": "test.pdf"}, - ] - - filtered = processor._filter_empty_rows(rows) - - assert len(filtered) == 3 # All have at least one non-empty field - - def test_filter_empty_rows_ignores_filename_field(self): - """Test that Filename field is ignored when checking emptiness.""" - processor = create_test_processor( - columns={"Date": (0, 100), "Details": (100, 200)}, - ) - - rows = [ - { - "Date": "", - "Details": "", - "Filename": "test.pdf", - }, # Should be filtered even with Filename - ] - - filtered = processor._filter_empty_rows(rows) - - assert len(filtered) == 0 - - def test_filter_empty_rows_handles_empty_list(self): - """Test filtering an empty list.""" - processor = create_test_processor( - columns={"Date": (0, 100)}, - ) - - filtered = processor._filter_empty_rows([]) - - assert filtered == [] - - def test_filter_empty_rows_handles_mixed_types(self): - """Test filtering with different value types.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Amount": (200, 300), - }, - ) - - rows = [ - { - "Date": 0, - "Details": "", - "Amount": "", - "Filename": "test.pdf", - }, # 0 is valid data - { - "Date": False, - "Details": "", - "Amount": "", - "Filename": "test.pdf", - }, # False is valid - { - "Date": "", - "Details": 0.0, - "Amount": "", - "Filename": "test.pdf", - }, # 0.0 is valid - ] - - filtered = processor._filter_empty_rows(rows) - - # Note: 0, False, and 0.0 are falsy but should be kept if they're actual data - # Current implementation treats them as empty, but that's acceptable for bank statements - # where 0 values are typically represented as "0.00" strings, not actual 0 - assert len(filtered) == 0 # All filtered as empty (falsy values) - - def test_filter_empty_rows_whitespace_only(self): - """Test that rows with only whitespace are filtered.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Amount": (200, 300), - }, - ) - - rows = [ - {"Date": " ", "Details": "\t", "Amount": "\n", "Filename": "test.pdf"}, - {"Date": " \n ", "Details": "", "Amount": " ", "Filename": "test.pdf"}, - ] - - filtered = processor._filter_empty_rows(rows) - - assert len(filtered) == 0 - - def test_filter_empty_rows_preserves_valid_data(self): - """Test that valid data is preserved exactly as is.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Debit": (200, 250), - "Credit": (250, 300), - }, - ) - - rows = [ - { - "Date": "01 Jan 2025", - "Details": "Purchase", - "Debit": "50.00", - "Credit": "", - "Filename": "test.pdf", - }, - { - "Date": "02 Jan 2025", - "Details": "Deposit", - "Debit": "", - "Credit": "100.00", - "Filename": "test.pdf", - }, - ] - - filtered = processor._filter_empty_rows(rows) - - assert len(filtered) == 2 - assert filtered[0]["Date"] == "01 Jan 2025" - assert filtered[0]["Debit"] == "50.00" - assert filtered[1]["Date"] == "02 Jan 2025" - assert filtered[1]["Credit"] == "100.00" diff --git a/packages/parser-core/tests/test_header_row_filtering.py b/packages/parser-core/tests/test_header_row_filtering.py deleted file mode 100644 index 996b5c5..0000000 --- a/packages/parser-core/tests/test_header_row_filtering.py +++ /dev/null @@ -1,309 +0,0 @@ -"""Tests for header row filtering in processor.""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from bankstatements_core.config.processor_config import ( - ExtractionConfig, - ProcessorConfig, -) -from bankstatements_core.processor import BankStatementProcessor - - -def create_test_processor(**kwargs): - """Helper to create processor with test configuration.""" - input_dir = kwargs.pop("input_dir", Path("input")) - output_dir = kwargs.pop("output_dir", Path("output")) - columns = kwargs.pop("columns", None) - - extraction_config = ExtractionConfig(columns=columns) - config = ProcessorConfig( - input_dir=input_dir, - output_dir=output_dir, - extraction=extraction_config, - ) - return BankStatementProcessor(config=config) - - -class TestHeaderRowFiltering: - """Test that header rows are never written to output files.""" - - def test_is_header_row_exact_match(self): - """Test that rows with values matching column names are detected as headers.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Debit €": (200, 250), - "Credit €": (250, 300), - "Balance €": (300, 350), - }, - ) - - # Header row where values match column names - header_row = { - "Date": "Date", - "Details": "Details", - "Debit €": "Debit €", - "Credit €": "Credit €", - "Balance €": "Balance €", - "Filename": "test.pdf", - } - - assert processor._is_header_row(header_row) is True - - def test_is_header_row_partial_match(self): - """Test that rows with partial column name matches are detected as headers.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Debit €": (200, 250), - "Credit €": (250, 300), - }, - ) - - # Header row with some empty fields but enough matches - header_row = { - "Date": "", - "Details": "", - "Debit €": "Debit €", - "Credit €": "Credit €", - "Filename": "test.pdf", - } - - assert processor._is_header_row(header_row) is True - - def test_is_header_row_case_insensitive(self): - """Test that header detection is case-insensitive.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Debit €": (200, 250), - "Credit €": (250, 300), - }, - ) - - # Header row with different case - header_row = { - "Date": "date", - "Details": "DETAILS", - "Debit €": "debit €", - "Credit €": "CREDIT €", - "Filename": "test.pdf", - } - - assert processor._is_header_row(header_row) is True - - def test_is_header_row_transaction_row(self): - """Test that actual transaction rows are not detected as headers.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Debit €": (200, 250), - "Credit €": (250, 300), - "Balance €": (300, 350), - }, - ) - - # Actual transaction row - transaction_row = { - "Date": "01 Jan 2025", - "Details": "Purchase at store", - "Debit €": "50.00", - "Credit €": "", - "Balance €": "1000.00", - "Filename": "test.pdf", - } - - assert processor._is_header_row(transaction_row) is False - - def test_is_header_row_mixed_content(self): - """Test that rows with mixed transaction/header content are handled correctly.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Debit €": (200, 250), - "Credit €": (250, 300), - }, - ) - - # Row with one header value and one transaction value (should not be detected as header) - mixed_row = { - "Date": "01 Jan", - "Details": "Transaction", - "Debit €": "Debit €", # Header value - "Credit €": "", - "Filename": "test.pdf", - } - - # Only 1 out of 3 checked fields match (33%), so not a header - assert processor._is_header_row(mixed_row) is False - - def test_filter_header_rows_removes_headers(self): - """Test that filter_header_rows removes all header rows.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Debit €": (200, 250), - "Credit €": (250, 300), - }, - ) - - rows = [ - { - "Date": "Date", - "Details": "Details", - "Debit €": "Debit €", - "Credit €": "Credit €", - "Filename": "test.pdf", - }, - { - "Date": "01 Jan", - "Details": "Transaction", - "Debit €": "50.00", - "Credit €": "", - "Filename": "test.pdf", - }, - { - "Date": "", - "Details": "", - "Debit €": "Debit €", - "Credit €": "Credit €", - "Filename": "test.pdf", - }, - ] - - filtered = processor._filter_header_rows(rows) - - # Should only keep the transaction row - assert len(filtered) == 1 - assert filtered[0]["Details"] == "Transaction" - - def test_filter_header_rows_keeps_transactions(self): - """Test that filter_header_rows keeps all valid transactions.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Debit €": (200, 250), - "Credit €": (250, 300), - }, - ) - - rows = [ - { - "Date": "01 Jan", - "Details": "Purchase", - "Debit €": "50.00", - "Credit €": "", - "Filename": "test.pdf", - }, - { - "Date": "02 Jan", - "Details": "Deposit", - "Debit €": "", - "Credit €": "100.00", - "Filename": "test.pdf", - }, - ] - - filtered = processor._filter_header_rows(rows) - - # Should keep all transactions - assert len(filtered) == 2 - - def test_filter_header_rows_empty_list(self): - """Test filtering an empty list.""" - processor = create_test_processor( - columns={"Date": (0, 100)}, - ) - - filtered = processor._filter_header_rows([]) - - assert filtered == [] - - def test_header_row_with_currency_symbols(self): - """Test header rows with currency symbols are detected.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Debit €": (200, 250), - "Credit €": (250, 300), - "Balance €": (300, 350), - }, - ) - - # Header row from actual duplicate (from user's example) - header_row = { - "Date": "", - "Details": "", - "Debit €": "Debit €", - "Credit €": "Credit €", - "Balance €": "Balance €", - "Filename": "Statement JL CA 202502.pdf", - } - - assert processor._is_header_row(header_row) is True - - def test_combined_empty_and_header_filtering(self): - """Test that both empty rows and header rows are filtered together.""" - processor = create_test_processor( - columns={ - "Date": (0, 100), - "Details": (100, 200), - "Debit €": (200, 250), - "Credit €": (250, 300), - }, - ) - - rows = [ - # Empty row - { - "Date": "", - "Details": "", - "Debit €": "", - "Credit €": "", - "Filename": "test.pdf", - }, - # Header row - { - "Date": "Date", - "Details": "Details", - "Debit €": "Debit €", - "Credit €": "Credit €", - "Filename": "test.pdf", - }, - # Valid transaction - { - "Date": "01 Jan", - "Details": "Purchase", - "Debit €": "50.00", - "Credit €": "", - "Filename": "test.pdf", - }, - # Another header row - { - "Date": "", - "Details": "", - "Debit €": "Debit €", - "Credit €": "Credit €", - "Filename": "test.pdf", - }, - ] - - # Apply both filters as done in the processor - non_empty = processor._filter_empty_rows(rows) - non_header = processor._filter_header_rows(non_empty) - - # Should only keep the valid transaction - assert len(non_header) == 1 - assert non_header[0]["Details"] == "Purchase" diff --git a/packages/parser-core/tests/test_iban_grouping.py b/packages/parser-core/tests/test_iban_grouping.py deleted file mode 100644 index 6a5a395..0000000 --- a/packages/parser-core/tests/test_iban_grouping.py +++ /dev/null @@ -1,199 +0,0 @@ -"""Tests for IBAN-based file grouping functionality.""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from bankstatements_core.config.processor_config import ProcessorConfig -from bankstatements_core.processor import BankStatementProcessor - - -class TestIBANGrouping: - """Test cases for grouping transactions by IBAN.""" - - @pytest.fixture - def processor(self, tmp_path): - """Create a processor instance for testing.""" - input_dir = tmp_path / "input" - output_dir = tmp_path / "output" - input_dir.mkdir() - output_dir.mkdir() - - config = ProcessorConfig(input_dir=input_dir, output_dir=output_dir) - processor = BankStatementProcessor(config=config) - return processor - - def test_group_rows_by_iban_single_iban(self, processor): - """Test grouping with single IBAN.""" - all_rows = [ - {"Filename": "statement1.pdf", "Date": "01/01/2024"}, - {"Filename": "statement2.pdf", "Date": "02/01/2024"}, - ] - - pdf_ibans = { - "statement1.pdf": "IE29AIBK93115212345678", - "statement2.pdf": "IE29AIBK93115212345678", - } - - result = processor._group_rows_by_iban(all_rows, pdf_ibans) - - assert len(result) == 1 - assert "5678" in result - assert len(result["5678"]) == 2 - - def test_group_rows_by_iban_multiple_ibans(self, processor): - """Test grouping with multiple IBANs.""" - all_rows = [ - {"Filename": "statement1.pdf", "Date": "01/01/2024"}, - {"Filename": "statement2.pdf", "Date": "02/01/2024"}, - {"Filename": "statement3.pdf", "Date": "03/01/2024"}, - ] - - pdf_ibans = { - "statement1.pdf": "IE29AIBK93115212345678", # ends in 5678 - "statement2.pdf": "IE29AIBK93115212349015", # ends in 9015 - "statement3.pdf": "IE29AIBK93115212345678", # ends in 5678 - } - - result = processor._group_rows_by_iban(all_rows, pdf_ibans) - - assert len(result) == 2 - assert "5678" in result - assert "9015" in result - assert len(result["5678"]) == 2 # statement1 and statement3 - assert len(result["9015"]) == 1 # statement2 - - def test_group_rows_by_iban_no_iban_found(self, processor): - """Test grouping when no IBAN found for some PDFs.""" - all_rows = [ - {"Filename": "statement1.pdf", "Date": "01/01/2024"}, - {"Filename": "statement2.pdf", "Date": "02/01/2024"}, - ] - - pdf_ibans = { - "statement1.pdf": "IE29AIBK93115212345678", - # statement2.pdf has no IBAN - } - - result = processor._group_rows_by_iban(all_rows, pdf_ibans) - - assert len(result) == 2 - assert "5678" in result - assert "unknown" in result - assert len(result["5678"]) == 1 - assert len(result["unknown"]) == 1 - - def test_group_rows_by_iban_all_unknown(self, processor): - """Test grouping when no IBANs found at all.""" - all_rows = [ - {"Filename": "statement1.pdf", "Date": "01/01/2024"}, - {"Filename": "statement2.pdf", "Date": "02/01/2024"}, - ] - - pdf_ibans = {} # No IBANs found - - result = processor._group_rows_by_iban(all_rows, pdf_ibans) - - assert len(result) == 1 - assert "unknown" in result - assert len(result["unknown"]) == 2 - - def test_iban_suffix_in_filename(self, processor): - """Test that IBAN suffix is correctly used in output filenames.""" - # This is more of an integration test - # We can check the filename generation logic - - unique_rows = [{"Date": "01/01/2024", "Details": "Test"}] - duplicate_rows = [] - - import pandas as pd - - df_unique = pd.DataFrame(unique_rows, columns=["Date", "Details"]) - - # Write with IBAN suffix using orchestrator - output_paths = processor._output_orchestrator.write_output_files( - unique_rows, duplicate_rows, df_unique, iban_suffix="5678" - ) - - # Check that filenames contain the IBAN suffix - for path in output_paths.values(): - assert "_5678" in path, f"IBAN suffix not in path: {path}" - - def test_no_iban_suffix_in_filename(self, processor): - """Test that filenames work without IBAN suffix.""" - unique_rows = [{"Date": "01/01/2024", "Details": "Test"}] - duplicate_rows = [] - - import pandas as pd - - df_unique = pd.DataFrame(unique_rows, columns=["Date", "Details"]) - - # Write without IBAN suffix using orchestrator - output_paths = processor._output_orchestrator.write_output_files( - unique_rows, duplicate_rows, df_unique, iban_suffix=None - ) - - # Check that filenames don't have unexpected suffixes - for path in output_paths.values(): - # Should have normal names without suffix - assert ( - "bank_statements" in path - or "duplicates" in path - or "monthly_summary" in path - or "expense_analysis" in path - ) - - def test_different_iban_suffixes(self, processor): - """Test various IBAN suffix formats.""" - test_cases = [ - ("IE29AIBK93115212345678", "5678"), - ("DE89370400440532013000", "3000"), - ("GB29NWBK60161331926819", "6819"), - ("FR1420041010050500013M02606", "2606"), - ] - - for full_iban, expected_suffix in test_cases: - all_rows = [{"Filename": "test.pdf", "Date": "01/01/2024"}] - pdf_ibans = {"test.pdf": full_iban} - - result = processor._group_rows_by_iban(all_rows, pdf_ibans) - - assert expected_suffix in result - assert len(result[expected_suffix]) == 1 - - def test_grouping_preserves_row_data(self, processor): - """Test that grouping doesn't modify row data.""" - original_rows = [ - { - "Filename": "statement1.pdf", - "Date": "01/01/2024", - "Details": "Transaction 1", - "Amount": "100.00", - }, - { - "Filename": "statement2.pdf", - "Date": "02/01/2024", - "Details": "Transaction 2", - "Amount": "200.00", - }, - ] - - pdf_ibans = { - "statement1.pdf": "IE29AIBK93115212345678", - "statement2.pdf": "IE29AIBK93115212349015", - } - - result = processor._group_rows_by_iban(original_rows, pdf_ibans) - - # Check that all fields are preserved - grouped_row_1 = result["5678"][0] - assert grouped_row_1["Date"] == "01/01/2024" - assert grouped_row_1["Details"] == "Transaction 1" - assert grouped_row_1["Amount"] == "100.00" - - grouped_row_2 = result["9015"][0] - assert grouped_row_2["Date"] == "02/01/2024" - assert grouped_row_2["Details"] == "Transaction 2" - assert grouped_row_2["Amount"] == "200.00" diff --git a/packages/parser-core/tests/test_repository_integration.py b/packages/parser-core/tests/test_repository_integration.py index d6be189..d8ceb75 100644 --- a/packages/parser-core/tests/test_repository_integration.py +++ b/packages/parser-core/tests/test_repository_integration.py @@ -28,7 +28,6 @@ class MockTransactionRepository(TransactionRepository): def __init__(self): self.saved_json = [] self.saved_csv = [] - self.appended_csv = [] def save_as_json(self, transactions: list[dict], file_path: Path) -> None: """Track JSON saves.""" @@ -39,8 +38,8 @@ def save_as_csv(self, data: str, file_path: Path) -> None: self.saved_csv.append((data, file_path)) def append_to_csv(self, file_path: Path, content: str) -> None: - """Track CSV appends.""" - self.appended_csv.append((file_path, content)) + """No-op for tests that don't need append tracking.""" + pass def load_from_json(self, file_path: Path) -> list[dict]: """Mock JSON load.""" @@ -86,31 +85,6 @@ def test_write_json_file_uses_repository(self, tmp_path): assert len(mock_repo.saved_json) == 1 assert mock_repo.saved_json[0] == (test_data, test_path) - def test_append_totals_uses_repository(self, tmp_path): - """Test that _append_totals_to_csv uses repository.""" - mock_repo = MockTransactionRepository() - processor = create_test_processor( - input_dir=tmp_path / "input", - output_dir=tmp_path / "output", - repository=mock_repo, - ) - - # Append totals - test_path = tmp_path / "test.csv" - all_columns = ["Date", "Debit €", "Credit €"] - totals = {"Debit €": 100.50, "Credit €": 200.75} - - processor._append_totals_to_csv(test_path, all_columns, totals) - - # Verify repository was used - assert len(mock_repo.appended_csv) == 1 - file_path, content = mock_repo.appended_csv[0] - assert file_path == test_path - assert "TOTAL" in content - assert "100.50" in content - assert "200.75" in content - - class TestFileSystemTransactionRepository: """Test FileSystemTransactionRepository implementation.""" From 4e3084a27aa5ded3ee6d745fc74f8fd5e9204586 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 15:35:43 +0000 Subject: [PATCH 5/8] refactor(19-01): inline PDFProcessingOrchestrator._process_single_pdf - Replace `result = self._process_single_pdf(pdf); rows, page_count, iban = result` with direct call `rows, page_count, iban = self.extraction_orchestrator.extract_from_pdf(pdf)` - Delete the 3-line passthrough _process_single_pdf method (no logic, pure indirection) --- .../services/pdf_processing_orchestrator.py | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py index 583c258..2004a32 100644 --- a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py @@ -123,8 +123,7 @@ def process_all_pdfs( logger.info("Processing PDF %d of %d", idx, len(pdf_files)) try: - result = self._process_single_pdf(pdf) - rows, page_count, iban = result + rows, page_count, iban = self.extraction_orchestrator.extract_from_pdf(pdf) pages_read += page_count # Check if should be excluded (no IBAN and no data) @@ -182,23 +181,6 @@ def process_all_pdfs( return all_rows, pages_read, pdf_ibans - def _process_single_pdf( - self, - pdf: Path, - ) -> tuple[list[dict], int, str | None]: - """Process a single PDF file. - - Args: - pdf: Path to PDF file - - Returns: - Tuple of (rows, page_count, iban) - """ - # Extract transactions using orchestrator - rows, page_count, iban = self.extraction_orchestrator.extract_from_pdf(pdf) - - return rows, page_count, iban - def _save_ibans(self, pdf_ibans: dict[str, str]) -> None: """Save extracted IBANs to JSON file. From 628e4245d9ef50069a1deb66fae6610a7a4c037c Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 15:55:56 +0000 Subject: [PATCH 6/8] style: apply black formatting to RFC #19 modified files --- .../extraction/pdf_extractor.py | 16 ++++++++++++---- .../services/content_density.py | 4 +++- .../services/page_validation.py | 4 +++- .../services/pdf_processing_orchestrator.py | 4 +++- .../bankstatements_core/services/row_merger.py | 4 +++- .../tests/test_repository_integration.py | 1 + 6 files changed, 25 insertions(+), 8 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py index fffbc40..e3bf110 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py @@ -144,7 +144,9 @@ def _extract_page(self, page: Any, page_num: int) -> list[dict] | None: page_rows = self._row_builder.build_rows(words) if self.page_validation_enabled: - from bankstatements_core.extraction.validation_facade import validate_page_structure + from bankstatements_core.extraction.validation_facade import ( + validate_page_structure, + ) if not validate_page_structure(page_rows, self.columns): logger.info( @@ -153,7 +155,9 @@ def _extract_page(self, page: Any, page_num: int) -> list[dict] | None: ) return None - from bankstatements_core.extraction.validation_facade import merge_continuation_lines + from bankstatements_core.extraction.validation_facade import ( + merge_continuation_lines, + ) return merge_continuation_lines(page_rows, self.columns) @@ -184,7 +188,9 @@ def _determine_boundaries_and_extract( all_words = initial_area.extract_words(use_text_flow=True) if self.header_check_enabled: - from bankstatements_core.extraction.validation_facade import detect_table_headers + from bankstatements_core.extraction.validation_facade import ( + detect_table_headers, + ) header_top = ( header_check_top_y @@ -225,7 +231,9 @@ def _determine_boundaries_and_extract( words = table_area.extract_words(use_text_flow=True) if self.header_check_enabled: - from bankstatements_core.extraction.validation_facade import detect_table_headers + from bankstatements_core.extraction.validation_facade import ( + detect_table_headers, + ) header_top = ( header_check_top_y diff --git a/packages/parser-core/src/bankstatements_core/services/content_density.py b/packages/parser-core/src/bankstatements_core/services/content_density.py index 82fde5e..3459e90 100644 --- a/packages/parser-core/src/bankstatements_core/services/content_density.py +++ b/packages/parser-core/src/bankstatements_core/services/content_density.py @@ -112,6 +112,8 @@ def _classify_row_type( String classification: 'transaction', etc. """ # Import here to avoid circular dependency - from bankstatements_core.extraction.row_classification_facade import classify_row_type + from bankstatements_core.extraction.row_classification_facade import ( + classify_row_type, + ) return classify_row_type(row, columns) diff --git a/packages/parser-core/src/bankstatements_core/services/page_validation.py b/packages/parser-core/src/bankstatements_core/services/page_validation.py index 98ec543..466727d 100644 --- a/packages/parser-core/src/bankstatements_core/services/page_validation.py +++ b/packages/parser-core/src/bankstatements_core/services/page_validation.py @@ -190,6 +190,8 @@ def _classify_row_type( String classification: 'transaction', etc. """ # Import here to avoid circular dependency - from bankstatements_core.extraction.row_classification_facade import classify_row_type + from bankstatements_core.extraction.row_classification_facade import ( + classify_row_type, + ) return classify_row_type(row, columns) diff --git a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py index 2004a32..eca3176 100644 --- a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py @@ -123,7 +123,9 @@ def process_all_pdfs( logger.info("Processing PDF %d of %d", idx, len(pdf_files)) try: - rows, page_count, iban = self.extraction_orchestrator.extract_from_pdf(pdf) + rows, page_count, iban = self.extraction_orchestrator.extract_from_pdf( + pdf + ) pages_read += page_count # Check if should be excluded (no IBAN and no data) diff --git a/packages/parser-core/src/bankstatements_core/services/row_merger.py b/packages/parser-core/src/bankstatements_core/services/row_merger.py index d706bb6..f4cfcb8 100644 --- a/packages/parser-core/src/bankstatements_core/services/row_merger.py +++ b/packages/parser-core/src/bankstatements_core/services/row_merger.py @@ -158,7 +158,9 @@ def _classify_row_type( String classification: 'transaction', 'continuation', etc. """ # Import here to avoid circular dependency - from bankstatements_core.extraction.row_classification_facade import classify_row_type + from bankstatements_core.extraction.row_classification_facade import ( + classify_row_type, + ) return classify_row_type(row, columns) diff --git a/packages/parser-core/tests/test_repository_integration.py b/packages/parser-core/tests/test_repository_integration.py index d8ceb75..ce22733 100644 --- a/packages/parser-core/tests/test_repository_integration.py +++ b/packages/parser-core/tests/test_repository_integration.py @@ -85,6 +85,7 @@ def test_write_json_file_uses_repository(self, tmp_path): assert len(mock_repo.saved_json) == 1 assert mock_repo.saved_json[0] == (test_data, test_path) + class TestFileSystemTransactionRepository: """Test FileSystemTransactionRepository implementation.""" From 04cd8304bc4aa823aade52bc739fd4aa62060f66 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 15:57:54 +0000 Subject: [PATCH 7/8] style: fix isort import order in processing_facade.py --- .../src/bankstatements_core/facades/processing_facade.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/parser-core/src/bankstatements_core/facades/processing_facade.py b/packages/parser-core/src/bankstatements_core/facades/processing_facade.py index c98ac33..b2abaf0 100644 --- a/packages/parser-core/src/bankstatements_core/facades/processing_facade.py +++ b/packages/parser-core/src/bankstatements_core/facades/processing_facade.py @@ -11,8 +11,8 @@ from typing import TYPE_CHECKING, Any from bankstatements_core.config.app_config import AppConfig, ConfigurationError -from bankstatements_core.entitlements import EntitlementError, Entitlements from bankstatements_core.config.column_config import get_columns_config +from bankstatements_core.entitlements import EntitlementError, Entitlements if TYPE_CHECKING: from bankstatements_core.processor import BankStatementProcessor From 34b3abb99a4657ba19e908d4853866e74e92d952 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 16:00:58 +0000 Subject: [PATCH 8/8] fix: resolve flake8 E402 and F401 in pdf_table_extractor and processor --- .../src/bankstatements_core/pdf_table_extractor.py | 4 ++-- packages/parser-core/src/bankstatements_core/processor.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py index 3c7f06e..827a46a 100644 --- a/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py +++ b/packages/parser-core/src/bankstatements_core/pdf_table_extractor.py @@ -14,6 +14,8 @@ import logging import warnings +import pdfplumber # noqa: F401 - used by extraction module + warnings.warn( "bankstatements_core.pdf_table_extractor is a backward-compatibility shim " "and will be removed in a future version. " @@ -24,8 +26,6 @@ stacklevel=2, ) -import pdfplumber # noqa: F401 - used by extraction module - logger = logging.getLogger(__name__) # Re-export column configuration (backward compatibility) diff --git a/packages/parser-core/src/bankstatements_core/processor.py b/packages/parser-core/src/bankstatements_core/processor.py index 95da657..2480ebf 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -4,7 +4,6 @@ import logging from collections import defaultdict # noqa: F401 - imported for test mocking from datetime import datetime -from pathlib import Path from typing import Any import pandas as pd