From 0b25121341dd0f57c2b474f53a354a14ffb55896 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 14:49:38 +0000 Subject: [PATCH 1/5] =?UTF-8?q?feat(#28):=20add=20ServiceRegistry=20and=20?= =?UTF-8?q?wire=20processor=20=E2=80=94=20enrichment/classify/dedup/sort?= =?UTF-8?q?=20pipeline=20now=20centralised=20in=20ServiceRegistry;=20Trans?= =?UTF-8?q?actionProcessingOrchestrator=20trimmed=20to=20group=5Fby=5Fiban?= =?UTF-8?q?=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/bankstatements_core/processor.py | 25 +- .../services/service_registry.py | 252 ++++++++++++++++++ .../transaction_processing_orchestrator.py | 156 +---------- 3 files changed, 279 insertions(+), 154 deletions(-) create mode 100644 packages/parser-core/src/bankstatements_core/services/service_registry.py diff --git a/packages/parser-core/src/bankstatements_core/processor.py b/packages/parser-core/src/bankstatements_core/processor.py index 7a3a9b5..5ee3358 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -28,6 +28,7 @@ TransactionSortingService, ) from bankstatements_core.services.transaction_filter import TransactionFilterService +from bankstatements_core.services.service_registry import ServiceRegistry from bankstatements_core.services.transaction_processing_orchestrator import ( TransactionProcessingOrchestrator, ) @@ -122,6 +123,7 @@ def __init__( activity_log: Any | None = None, entitlements: Any | None = None, template_registry: Any | None = None, + registry: ServiceRegistry | None = None, ): """ Initialize the bank statement processor. @@ -250,6 +252,17 @@ def __init__( sorting_service=self._sorting_service, ) + # ServiceRegistry: single wiring point for transaction processing + if registry is not None: + self._registry = registry + else: + self._registry = ServiceRegistry.from_config( + config, + entitlements=entitlements, + duplicate_detector=self._duplicate_service, + sorting_service=self._sorting_service, + ) + self._output_orchestrator = OutputOrchestrator( output_dir=self.output_dir, output_strategies=self.output_strategies, @@ -322,8 +335,8 @@ def run(self) -> dict: pdf_ibans[extraction.source_file.name] = extraction.iban all_rows.extend(transactions_to_dicts(extraction.transactions)) - # Step 2: Group transactions by IBAN (delegated to orchestrator) - rows_by_iban = self._transaction_orchestrator.group_by_iban(all_rows, pdf_ibans) + # Step 2: Group transactions by IBAN (delegated to registry) + rows_by_iban = self._registry.group_by_iban(all_rows, pdf_ibans) logger.debug( f"Grouped {len(all_rows)} transactions into {len(rows_by_iban)} IBAN groups" ) @@ -402,11 +415,9 @@ def _process_transaction_group( f"Using template '{template_id}' for transaction type classification" ) - # Detect duplicates and sort (delegated to orchestrator) - unique_rows, duplicate_rows = ( - self._transaction_orchestrator.process_transaction_group( - iban_rows, template=template - ) + # Detect duplicates and sort (delegated to registry) + unique_rows, duplicate_rows = self._registry.process_transaction_group( + iban_rows, template=template ) # Filter duplicates to remove any empty rows and header rows diff --git a/packages/parser-core/src/bankstatements_core/services/service_registry.py b/packages/parser-core/src/bankstatements_core/services/service_registry.py new file mode 100644 index 0000000..79cf9f3 --- /dev/null +++ b/packages/parser-core/src/bankstatements_core/services/service_registry.py @@ -0,0 +1,252 @@ +"""ServiceRegistry — single wiring point for transaction processing services. + +Centralises construction of duplicate detection, sorting, IBAN grouping, and +the enrichment/classification pipeline that was previously spread across +TransactionProcessingOrchestrator and BankStatementProcessor. + +Usage (primary path):: + + registry = ServiceRegistry.from_config(processor_config, entitlements) + unique, dupes = registry.process_transaction_group(rows, template) + grouped = registry.group_by_iban(rows, pdf_ibans) + +Escape hatches are available for callers that need individual services:: + + detector = registry.get_duplicate_detector() +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from bankstatements_core.config.processor_config import ProcessorConfig + from bankstatements_core.domain.protocols.services import ( + IDuplicateDetector, + IIBANGrouping, + ITransactionSorting, + ) + from bankstatements_core.entitlements import Entitlements + from bankstatements_core.templates.template_model import BankTemplate + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class _ServiceContext: + """Shared dependencies passed once to ServiceRegistry at construction time. + + This is an internal dataclass — never exposed to callers. + """ + + column_names: list[str] + debit_columns: list[str] + credit_columns: list[str] + entitlements: Any # Entitlements | None + + +class ServiceRegistry: + """Single wiring point for all transaction processing services. + + Callers use the primary methods for the common case. + Individual services are accessible via get_*() escape hatches for tests + or specialised callers. + """ + + def __init__( + self, + context: _ServiceContext, + duplicate_detector: "IDuplicateDetector", + sorting_service: "ITransactionSorting", + grouping_service: "IIBANGrouping", + ) -> None: + self._context = context + self._duplicate_detector = duplicate_detector + self._sorting_service = sorting_service + self._grouping_service = grouping_service + + # ------------------------------------------------------------------ + # Factory + # ------------------------------------------------------------------ + + @classmethod + def from_config( + cls, + config: "ProcessorConfig", + entitlements: "Entitlements | None" = None, + duplicate_detector: "IDuplicateDetector | None" = None, + sorting_service: "ITransactionSorting | None" = None, + grouping_service: "IIBANGrouping | None" = None, + ) -> "ServiceRegistry": + """Build a ServiceRegistry from a ProcessorConfig. + + Args: + config: Processor configuration carrying column, sorting, and + processing settings. + entitlements: Optional tier-based entitlements. + duplicate_detector: Override duplicate detector (default: AllFields). + sorting_service: Override sorting service (default: chronological + if config.processing.sort_by_date, else no-sort). + grouping_service: Override IBAN grouping service (default: suffix-4). + + Returns: + Fully wired ServiceRegistry instance. + """ + from bankstatements_core.config.column_config import get_column_names + from bankstatements_core.patterns.strategies import ( + AllFieldsDuplicateStrategy, + ChronologicalSortingStrategy, + NoSortingStrategy, + ) + from bankstatements_core.processor import find_matching_columns + from bankstatements_core.services.duplicate_detector import ( + DuplicateDetectionService, + ) + from bankstatements_core.services.iban_grouping import IBANGroupingService + from bankstatements_core.services.sorting_service import ( + TransactionSortingService, + ) + + column_names = get_column_names( + config.extraction.columns + ) if config.extraction.columns else [] + debit_columns = find_matching_columns(column_names, ["debit"]) + credit_columns = find_matching_columns(column_names, ["credit"]) + + context = _ServiceContext( + column_names=column_names, + debit_columns=debit_columns, + credit_columns=credit_columns, + entitlements=entitlements, + ) + + if duplicate_detector is None: + duplicate_detector = DuplicateDetectionService(AllFieldsDuplicateStrategy()) + + if sorting_service is None: + sort_strategy = ( + ChronologicalSortingStrategy() + if config.processing.sort_by_date + else NoSortingStrategy() + ) + sorting_service = TransactionSortingService(sort_strategy) + + if grouping_service is None: + grouping_service = IBANGroupingService() + + return cls(context, duplicate_detector, sorting_service, grouping_service) + + # ------------------------------------------------------------------ + # Primary methods (80 % case) + # ------------------------------------------------------------------ + + def process_transaction_group( + self, + transactions: list[dict], + template: "BankTemplate | None" = None, + ) -> tuple[list[dict], list[dict]]: + """Enrich → classify → deduplicate → sort a group of transactions. + + This replaces the explicit five-call chain that was previously spread + across BankStatementProcessor and TransactionProcessingOrchestrator. + + Args: + transactions: List of transaction dicts for a single IBAN group. + template: Optional bank template used for transaction type keywords. + + Returns: + Tuple of (unique_transactions, duplicate_transactions). + """ + enriched = self._enrich_with_filename(transactions) + enriched = self._enrich_with_document_type(enriched) + enriched = self._classify_transaction_types(enriched, template) + + unique_rows, duplicate_rows = self._duplicate_detector.detect_and_separate( + enriched + ) + logger.info( + "Duplicate detection: %d unique, %d duplicates", + len(unique_rows), + len(duplicate_rows), + ) + + sorted_rows = self._sorting_service.sort(unique_rows) + return sorted_rows, duplicate_rows + + def group_by_iban( + self, + transactions: list[dict], + pdf_ibans: dict[str, str], + ) -> dict[str, list[dict]]: + """Group transactions by IBAN suffix. + + Args: + transactions: Flat list of all transaction dicts. + pdf_ibans: Mapping of PDF filename → IBAN string. + + Returns: + Dict of IBAN suffix → list of transaction dicts. + """ + return self._grouping_service.group_by_iban(transactions, pdf_ibans) + + # ------------------------------------------------------------------ + # Escape hatches (20 % case) + # ------------------------------------------------------------------ + + def get_duplicate_detector(self) -> "IDuplicateDetector": + return self._duplicate_detector + + def get_sorting_service(self) -> "ITransactionSorting": + return self._sorting_service + + def get_grouping_service(self) -> "IIBANGrouping": + return self._grouping_service + + # ------------------------------------------------------------------ + # Internal enrichment helpers (inlined from TransactionProcessingOrchestrator) + # ------------------------------------------------------------------ + + @staticmethod + def _enrich_with_filename(transactions: list[dict]) -> list[dict]: + """Set Filename key from source_pdf if not already present.""" + for row in transactions: + if "Filename" not in row: + row["Filename"] = row.get("source_pdf", "") + return transactions + + @staticmethod + def _enrich_with_document_type( + transactions: list[dict], default_type: str = "bank_statement" + ) -> list[dict]: + """Set document_type if not already present.""" + for row in transactions: + if "document_type" not in row: + row["document_type"] = default_type + return transactions + + @staticmethod + def _classify_transaction_types( + transactions: list[dict], + template: "BankTemplate | None" = None, + ) -> list[dict]: + """Classify each transaction using Chain of Responsibility.""" + from bankstatements_core.services.transaction_type_classifier import ( + create_transaction_type_classifier_chain, + ) + + if not transactions: + return transactions + + document_type = transactions[0].get("document_type") + classifier = create_transaction_type_classifier_chain(document_type) + + for transaction in transactions: + transaction["transaction_type"] = classifier.classify(transaction, template) + + logger.info( + "Transaction type classification: %d transactions classified", + len(transactions), + ) + return transactions diff --git a/packages/parser-core/src/bankstatements_core/services/transaction_processing_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/transaction_processing_orchestrator.py index a5c2ed1..077bb25 100644 --- a/packages/parser-core/src/bankstatements_core/services/transaction_processing_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/transaction_processing_orchestrator.py @@ -1,10 +1,10 @@ """Transaction Processing Orchestrator for bank statements. -This module orchestrates transaction-level processing including: -- IBAN grouping -- Duplicate detection -- Sorting -- Filename enrichment +This module orchestrates IBAN grouping. Enrichment, classification, duplicate +detection and sorting have moved to ServiceRegistry. + +Note: This class is retained for backward compatibility. A follow-up issue will +track its complete removal once all callers migrate to ServiceRegistry. """ from __future__ import annotations @@ -18,19 +18,16 @@ IIBANGrouping, ITransactionSorting, ) - from bankstatements_core.templates.template_model import BankTemplate logger = logging.getLogger(__name__) class TransactionProcessingOrchestrator: - """Orchestrates transaction processing pipeline. + """Orchestrates IBAN grouping for transaction processing. - Handles: - - Grouping transactions by IBAN - - Duplicate detection and removal - - Transaction sorting (chronological or none) - - Enrichment with source filename + Note: enrichment and classification logic has moved to ServiceRegistry. + This class is retained for backward compatibility and will be removed in a + follow-up. """ def __init__( @@ -63,140 +60,5 @@ def group_by_iban( Returns: Dictionary mapping IBANs to their transactions - - Examples: - >>> orchestrator = TransactionProcessingOrchestrator(detector, sorter) - >>> grouped = orchestrator.group_by_iban(transactions, ibans) - >>> for iban, txns in grouped.items(): - ... print(f"IBAN {iban}: {len(txns)} transactions") """ return self.grouping_service.group_by_iban(transactions, pdf_ibans) - - def process_transaction_group( - self, transactions: list[dict], template: "BankTemplate | None" = None - ) -> tuple[list[dict], list[dict]]: - """Process a group of transactions (detect duplicates, sort, enrich). - - Args: - transactions: List of transaction dictionaries - template: Optional bank template with transaction type keywords - - Returns: - Tuple of (unique_transactions, duplicate_transactions) - - Examples: - >>> orchestrator = TransactionProcessingOrchestrator(detector, sorter) - >>> unique, dupes = orchestrator.process_transaction_group(transactions) - >>> total = len(unique) + len(dupes) - >>> print(f"Found {len(dupes)} duplicates in {total} transactions") - """ - # 0. Enrich with metadata (filename, document_type, transaction_type) - enriched = self.enrich_with_filename(transactions) - enriched = self.enrich_with_document_type(enriched) - enriched = self.classify_transaction_types(enriched, template) - - # 1. Detect duplicates (now with transaction_type available) - unique_rows, duplicate_rows = self.duplicate_detector.detect_and_separate( - enriched - ) - - logger.info( - "Duplicate detection: %d unique, %d duplicates", - len(unique_rows), - len(duplicate_rows), - ) - - # 2. Sort transactions if configured - sorted_rows = self.sorting_service.sort(unique_rows) - - return sorted_rows, duplicate_rows - - def enrich_with_filename(self, transactions: list[dict]) -> list[dict]: - """Add 'Filename' column to transactions if not present. - - Args: - transactions: List of transaction dictionaries - - Returns: - Transactions with 'Filename' column added - - Examples: - >>> orchestrator = TransactionProcessingOrchestrator(detector, sorter) - >>> enriched = orchestrator.enrich_with_filename(transactions) - >>> all('Filename' in txn for txn in enriched) - True - """ - for row in transactions: - if "Filename" not in row: - row["Filename"] = row.get("source_pdf", "") - - return transactions - - def enrich_with_document_type( - self, transactions: list[dict], default_type: str = "bank_statement" - ) -> list[dict]: - """Add 'document_type' column to transactions if not present. - - Args: - transactions: List of transaction dictionaries - default_type: Default document type if missing (default: "bank_statement") - - Returns: - Transactions with document_type field - - Examples: - >>> orchestrator = TransactionProcessingOrchestrator(detector, sorter) - >>> enriched = orchestrator.enrich_with_document_type(transactions) - >>> all('document_type' in txn for txn in enriched) - True - """ - for row in transactions: - if "document_type" not in row: - row["document_type"] = default_type - - return transactions - - def classify_transaction_types( - self, transactions: list[dict], template: "BankTemplate | None" = None - ) -> list[dict]: - """Classify transaction type for each transaction. - - Uses Chain of Responsibility pattern to apply multiple classification - strategies in sequence until one succeeds. - - Args: - transactions: List of transaction dictionaries - template: Optional bank template with transaction type keywords - - Returns: - Transactions with transaction_type field added - - Examples: - >>> orchestrator = TransactionProcessingOrchestrator(detector, sorter) - >>> enriched = orchestrator.classify_transaction_types(transactions, template) - >>> all('transaction_type' in txn for txn in enriched) - True - """ - from bankstatements_core.services.transaction_type_classifier import ( - create_transaction_type_classifier_chain, - ) - - if not transactions: - return transactions - - # Get document type from first transaction (all in group have same type) - document_type = transactions[0].get("document_type") - - # Create classifier chain appropriate for document type - classifier = create_transaction_type_classifier_chain(document_type) - - # Classify each transaction - for transaction in transactions: - transaction["transaction_type"] = classifier.classify(transaction, template) - - logger.info( - "Transaction type classification: %d transactions classified", - len(transactions), - ) - - return transactions From eba90746e0cbc3e3f45db16ab583f0ab729ef6cf Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 14:58:19 +0000 Subject: [PATCH 2/5] feat(#28): wire ServiceRegistry through factory and builder; add boundary tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ProcessorFactory.create_custom and BankStatementProcessorBuilder.build() now construct ServiceRegistry.from_config() and inject it via registry= kwarg, so the processor no longer builds its own registry internally - Fix wrong import of ChronologicalSortingStrategy (patterns.strategies → services.sorting_service) in service_registry.py - Add tests/services/test_service_registry.py with boundary tests covering from_config wiring, process_transaction_group pipeline, group_by_iban delegation, and escape hatches - Trim test_transaction_processing_orchestrator.py to group_by_iban only; removed tests for methods that moved to ServiceRegistry --- .../builders/processor_builder.py | 5 + .../bankstatements_core/patterns/factories.py | 5 + .../services/service_registry.py | 12 +- .../tests/services/test_service_registry.py | 150 ++++++++++ ...est_transaction_processing_orchestrator.py | 257 +----------------- 5 files changed, 171 insertions(+), 258 deletions(-) create mode 100644 packages/parser-core/tests/services/test_service_registry.py diff --git a/packages/parser-core/src/bankstatements_core/builders/processor_builder.py b/packages/parser-core/src/bankstatements_core/builders/processor_builder.py index 70b3cc2..d5b90b2 100644 --- a/packages/parser-core/src/bankstatements_core/builders/processor_builder.py +++ b/packages/parser-core/src/bankstatements_core/builders/processor_builder.py @@ -342,6 +342,10 @@ def build(self) -> "BankStatementProcessor": config.extraction.enable_dynamic_boundary, ) + from bankstatements_core.services.service_registry import ServiceRegistry + + registry = ServiceRegistry.from_config(config, entitlements=self._entitlements) + return BankStatementProcessor( config=config, output_strategies=self._output_strategies, @@ -349,4 +353,5 @@ def build(self) -> "BankStatementProcessor": repository=self._repository, activity_log=self._activity_log, entitlements=self._entitlements, + registry=registry, ) diff --git a/packages/parser-core/src/bankstatements_core/patterns/factories.py b/packages/parser-core/src/bankstatements_core/patterns/factories.py index a556a8f..b260b23 100644 --- a/packages/parser-core/src/bankstatements_core/patterns/factories.py +++ b/packages/parser-core/src/bankstatements_core/patterns/factories.py @@ -210,11 +210,16 @@ def create_custom( ), ) + from bankstatements_core.services.service_registry import ServiceRegistry + + registry = ServiceRegistry.from_config(config, entitlements=entitlements) + processor = BankStatementProcessor( config=config, output_strategies=output_strategies, duplicate_strategy=duplicate_strategy, entitlements=entitlements, + registry=registry, ) return processor diff --git a/packages/parser-core/src/bankstatements_core/services/service_registry.py b/packages/parser-core/src/bankstatements_core/services/service_registry.py index 79cf9f3..54d179d 100644 --- a/packages/parser-core/src/bankstatements_core/services/service_registry.py +++ b/packages/parser-core/src/bankstatements_core/services/service_registry.py @@ -97,8 +97,6 @@ def from_config( from bankstatements_core.config.column_config import get_column_names from bankstatements_core.patterns.strategies import ( AllFieldsDuplicateStrategy, - ChronologicalSortingStrategy, - NoSortingStrategy, ) from bankstatements_core.processor import find_matching_columns from bankstatements_core.services.duplicate_detector import ( @@ -106,12 +104,16 @@ def from_config( ) from bankstatements_core.services.iban_grouping import IBANGroupingService from bankstatements_core.services.sorting_service import ( + ChronologicalSortingStrategy, + NoSortingStrategy, TransactionSortingService, ) - column_names = get_column_names( - config.extraction.columns - ) if config.extraction.columns else [] + column_names = ( + get_column_names(config.extraction.columns) + if config.extraction.columns + else [] + ) debit_columns = find_matching_columns(column_names, ["debit"]) credit_columns = find_matching_columns(column_names, ["credit"]) diff --git a/packages/parser-core/tests/services/test_service_registry.py b/packages/parser-core/tests/services/test_service_registry.py new file mode 100644 index 0000000..6366167 --- /dev/null +++ b/packages/parser-core/tests/services/test_service_registry.py @@ -0,0 +1,150 @@ +"""Boundary tests for ServiceRegistry. + +Covers: +- from_config builds a fully wired registry +- process_transaction_group runs enrich → classify → dedup → sort +- group_by_iban delegates to grouping service +- escape hatches return the injected services +""" + +from __future__ import annotations + +import tempfile +from pathlib import Path +from unittest.mock import Mock + +import pytest + +from bankstatements_core.config.processor_config import ( + ExtractionConfig, + OutputConfig, + ProcessingConfig, + ProcessorConfig, +) +from bankstatements_core.services.duplicate_detector import DuplicateDetectionService +from bankstatements_core.services.iban_grouping import IBANGroupingService +from bankstatements_core.services.service_registry import ServiceRegistry +from bankstatements_core.services.sorting_service import TransactionSortingService + + +def _minimal_config(sort_by_date: bool = True) -> ProcessorConfig: + tmp = Path(tempfile.mkdtemp()) + return ProcessorConfig( + input_dir=tmp, + output_dir=tmp, + extraction=ExtractionConfig(), + processing=ProcessingConfig(sort_by_date=sort_by_date), + output=OutputConfig(), + ) + + +class TestFromConfig: + def test_builds_registry_with_default_services(self): + config = _minimal_config() + registry = ServiceRegistry.from_config(config) + assert isinstance(registry.get_duplicate_detector(), DuplicateDetectionService) + assert isinstance(registry.get_sorting_service(), TransactionSortingService) + assert isinstance(registry.get_grouping_service(), IBANGroupingService) + + def test_injected_services_override_defaults(self): + config = _minimal_config() + mock_dedup = Mock() + mock_sort = Mock() + mock_group = Mock() + registry = ServiceRegistry.from_config( + config, + duplicate_detector=mock_dedup, + sorting_service=mock_sort, + grouping_service=mock_group, + ) + assert registry.get_duplicate_detector() is mock_dedup + assert registry.get_sorting_service() is mock_sort + assert registry.get_grouping_service() is mock_group + + +class TestProcessTransactionGroup: + def test_enriches_classifies_deduplicates_and_sorts(self): + config = _minimal_config() + transactions = [ + {"Date": "01/01/2024", "Details": "Test", "source_pdf": "a.pdf"}, + ] + + mock_dedup = Mock() + mock_dedup.detect_and_separate.return_value = (transactions, []) + mock_sort = Mock() + mock_sort.sort.side_effect = lambda x: x + + registry = ServiceRegistry.from_config( + config, + duplicate_detector=mock_dedup, + sorting_service=mock_sort, + ) + + unique, dupes = registry.process_transaction_group(transactions) + + # Enrichment happened before dedup + called_with = mock_dedup.detect_and_separate.call_args[0][0] + assert called_with[0]["Filename"] == "a.pdf" + assert called_with[0]["document_type"] == "bank_statement" + assert "transaction_type" in called_with[0] + + # Sort was called and result returned + mock_sort.sort.assert_called_once() + assert unique == transactions + assert dupes == [] + + def test_returns_unique_and_duplicate_lists(self): + config = _minimal_config() + tx1 = {"Date": "01/01/2024", "Details": "A", "source_pdf": "x.pdf"} + tx2 = {"Date": "01/01/2024", "Details": "A", "source_pdf": "x.pdf"} + + mock_dedup = Mock() + mock_dedup.detect_and_separate.return_value = ([tx1], [tx2]) + mock_sort = Mock() + mock_sort.sort.side_effect = lambda x: x + + registry = ServiceRegistry.from_config( + config, + duplicate_detector=mock_dedup, + sorting_service=mock_sort, + ) + + unique, dupes = registry.process_transaction_group([tx1, tx2]) + assert len(unique) == 1 + assert len(dupes) == 1 + + +class TestGroupByIban: + def test_delegates_to_grouping_service(self): + config = _minimal_config() + mock_group = Mock() + mock_group.group_by_iban.return_value = {"1234": []} + + registry = ServiceRegistry.from_config(config, grouping_service=mock_group) + transactions = [{"Date": "01/01/2024"}] + pdf_ibans = {"a.pdf": "IE001234"} + + result = registry.group_by_iban(transactions, pdf_ibans) + + mock_group.group_by_iban.assert_called_once_with(transactions, pdf_ibans) + assert result == {"1234": []} + + +class TestEscapeHatches: + def test_get_duplicate_detector_returns_injected(self): + config = _minimal_config() + mock_dedup = Mock() + registry = ServiceRegistry.from_config(config, duplicate_detector=mock_dedup) + assert registry.get_duplicate_detector() is mock_dedup + + def test_get_sorting_service_returns_injected(self): + config = _minimal_config() + mock_sort = Mock() + registry = ServiceRegistry.from_config(config, sorting_service=mock_sort) + assert registry.get_sorting_service() is mock_sort + + def test_get_grouping_service_returns_injected(self): + config = _minimal_config() + mock_group = Mock() + registry = ServiceRegistry.from_config(config, grouping_service=mock_group) + assert registry.get_grouping_service() is mock_group diff --git a/packages/parser-core/tests/services/test_transaction_processing_orchestrator.py b/packages/parser-core/tests/services/test_transaction_processing_orchestrator.py index 6d3e233..30f8f6a 100644 --- a/packages/parser-core/tests/services/test_transaction_processing_orchestrator.py +++ b/packages/parser-core/tests/services/test_transaction_processing_orchestrator.py @@ -1,7 +1,7 @@ """Tests for TransactionProcessingOrchestrator. -This module tests the transaction processing orchestration including -duplicate detection, sorting, and metadata enrichment. +The orchestrator now only handles IBAN grouping. Enrichment, classification, +duplicate detection and sorting are tested via test_service_registry.py. """ from __future__ import annotations @@ -17,244 +17,34 @@ @pytest.fixture def mock_duplicate_detector(): - """Create a mock duplicate detector.""" detector = Mock() - detector.detect_and_separate.return_value = ([], []) # (unique, duplicates) + detector.detect_and_separate.return_value = ([], []) return detector @pytest.fixture def mock_sorting_service(): - """Create a mock sorting service.""" sorter = Mock() - sorter.sort.side_effect = lambda x: x # Pass through + sorter.sort.side_effect = lambda x: x return sorter @pytest.fixture def orchestrator(mock_duplicate_detector, mock_sorting_service): - """Create a TransactionProcessingOrchestrator instance.""" return TransactionProcessingOrchestrator( duplicate_detector=mock_duplicate_detector, sorting_service=mock_sorting_service, ) -class TestEnrichWithFilename: - """Test filename enrichment.""" - - def test_adds_filename_when_missing(self, orchestrator): - """Test that Filename is added when not present.""" - transactions = [ - {"Date": "01/12/2023", "Details": "Test", "source_pdf": "test.pdf"}, - {"Date": "02/12/2023", "Details": "Test2", "source_pdf": "test2.pdf"}, - ] - - enriched = orchestrator.enrich_with_filename(transactions) - - assert enriched[0]["Filename"] == "test.pdf" - assert enriched[1]["Filename"] == "test2.pdf" - - def test_preserves_existing_filename(self, orchestrator): - """Test that existing Filename is preserved.""" - transactions = [ - { - "Date": "01/12/2023", - "Details": "Test", - "Filename": "existing.pdf", - "source_pdf": "other.pdf", - } - ] - - enriched = orchestrator.enrich_with_filename(transactions) - - # Should keep existing Filename - assert enriched[0]["Filename"] == "existing.pdf" - - def test_handles_missing_source_pdf(self, orchestrator): - """Test handling when source_pdf is missing.""" - transactions = [{"Date": "01/12/2023", "Details": "Test"}] - - enriched = orchestrator.enrich_with_filename(transactions) - - # Should add empty Filename - assert enriched[0]["Filename"] == "" - - -class TestEnrichWithDocumentType: - """Test document type enrichment.""" - - def test_adds_document_type_when_missing(self, orchestrator): - """Test that document_type is added when not present.""" - transactions = [ - {"Date": "01/12/2023", "Details": "Test1"}, - {"Date": "02/12/2023", "Details": "Test2"}, - ] - - enriched = orchestrator.enrich_with_document_type(transactions) - - # Should add default document_type - assert enriched[0]["document_type"] == "bank_statement" - assert enriched[1]["document_type"] == "bank_statement" - - def test_preserves_existing_document_type(self, orchestrator): - """Test that existing document_type is preserved.""" - transactions = [ - { - "Date": "01/12/2023", - "Details": "Card Purchase", - "document_type": "credit_card_statement", - }, - { - "Date": "02/12/2023", - "Details": "Loan Payment", - "document_type": "loan_statement", - }, - ] - - enriched = orchestrator.enrich_with_document_type(transactions) - - # Should preserve existing types - assert enriched[0]["document_type"] == "credit_card_statement" - assert enriched[1]["document_type"] == "loan_statement" - - def test_custom_default_type(self, orchestrator): - """Test using custom default document type.""" - transactions = [{"Date": "01/12/2023", "Details": "Test"}] - - enriched = orchestrator.enrich_with_document_type( - transactions, default_type="credit_card_statement" - ) - - assert enriched[0]["document_type"] == "credit_card_statement" - - def test_mixed_existing_and_missing(self, orchestrator): - """Test handling mix of transactions with and without document_type.""" - transactions = [ - { - "Date": "01/12/2023", - "Details": "Has Type", - "document_type": "credit_card_statement", - }, - {"Date": "02/12/2023", "Details": "No Type"}, - { - "Date": "03/12/2023", - "Details": "Has Type", - "document_type": "loan_statement", - }, - ] - - enriched = orchestrator.enrich_with_document_type(transactions) - - # Should preserve existing and add default for missing - assert enriched[0]["document_type"] == "credit_card_statement" - assert enriched[1]["document_type"] == "bank_statement" # Default added - assert enriched[2]["document_type"] == "loan_statement" - - -class TestProcessTransactionGroup: - """Test transaction group processing.""" - - def test_enriches_before_duplicate_detection( - self, mock_duplicate_detector, mock_sorting_service - ): - """Test that enrichment happens before duplicate detection.""" - transactions = [ - {"Date": "01/12/2023", "Details": "Test", "source_pdf": "test.pdf"} - ] - - # Setup mock to capture what's passed to detect_and_separate - captured_input = [] - - def capture_input(txns): - captured_input.extend(txns) - return (txns, []) # Return as unique, no duplicates - - mock_duplicate_detector.detect_and_separate.side_effect = capture_input - - orchestrator = TransactionProcessingOrchestrator( - duplicate_detector=mock_duplicate_detector, - sorting_service=mock_sorting_service, - ) - - orchestrator.process_transaction_group(transactions) - - # Verify enrichment happened before duplicate detection - assert len(captured_input) > 0 - assert "Filename" in captured_input[0] - assert "document_type" in captured_input[0] - - def test_enrichment_includes_both_fields( - self, mock_duplicate_detector, mock_sorting_service - ): - """Test that both Filename and document_type are added.""" - transactions = [ - {"Date": "01/12/2023", "Details": "Test", "source_pdf": "test.pdf"} - ] - - captured_input = [] - - def capture_input(txns): - captured_input.extend(txns) - return (txns, []) - - mock_duplicate_detector.detect_and_separate.side_effect = capture_input - - orchestrator = TransactionProcessingOrchestrator( - duplicate_detector=mock_duplicate_detector, - sorting_service=mock_sorting_service, - ) - - orchestrator.process_transaction_group(transactions) - - # Both fields should be present - assert captured_input[0]["Filename"] == "test.pdf" - assert captured_input[0]["document_type"] == "bank_statement" - - def test_enrichment_preserves_existing_document_type( - self, mock_duplicate_detector, mock_sorting_service - ): - """Test that existing document_type from extraction is preserved.""" - transactions = [ - { - "Date": "01/12/2023", - "Details": "Card Purchase", - "Filename": "card.pdf", - "document_type": "credit_card_statement", # Already set by extractor - } - ] - - captured_input = [] - - def capture_input(txns): - captured_input.extend(txns) - return (txns, []) - - mock_duplicate_detector.detect_and_separate.side_effect = capture_input - - orchestrator = TransactionProcessingOrchestrator( - duplicate_detector=mock_duplicate_detector, - sorting_service=mock_sorting_service, - ) - - orchestrator.process_transaction_group(transactions) - - # Should preserve credit_card_statement, not override with default - assert captured_input[0]["document_type"] == "credit_card_statement" - - class TestGroupByIBAN: - """Test IBAN grouping.""" - def test_delegates_to_grouping_service(self, orchestrator): - """Test that group_by_iban delegates to grouping service.""" transactions = [ {"Date": "01/12/2023", "Details": "Test", "Filename": "test1.pdf"}, {"Date": "02/12/2023", "Details": "Test2", "Filename": "test2.pdf"}, ] pdf_ibans = {"test1.pdf": "IE12345", "test2.pdf": "IE67890"} - # Mock the grouping service orchestrator.grouping_service = Mock() orchestrator.grouping_service.group_by_iban.return_value = { "IE12345": [transactions[0]], @@ -263,47 +53,8 @@ def test_delegates_to_grouping_service(self, orchestrator): result = orchestrator.group_by_iban(transactions, pdf_ibans) - # Verify delegation orchestrator.grouping_service.group_by_iban.assert_called_once_with( transactions, pdf_ibans ) assert "IE12345" in result assert "IE67890" in result - - -class TestEnrichmentIntegration: - """Integration tests for enrichment in processing pipeline.""" - - def test_full_pipeline_with_document_types( - self, mock_duplicate_detector, mock_sorting_service - ): - """Test full processing pipeline preserves document types.""" - transactions = [ - { - "Date": "01/12/2023", - "Details": "Bank", - "source_pdf": "bank.pdf", - "document_type": "bank_statement", - }, - { - "Date": "02/12/2023", - "Details": "Card", - "source_pdf": "card.pdf", - "document_type": "credit_card_statement", - }, - ] - - # Setup mocks to pass through - mock_duplicate_detector.detect_and_separate.return_value = (transactions, []) - mock_sorting_service.sort.side_effect = lambda x: x - - orchestrator = TransactionProcessingOrchestrator( - duplicate_detector=mock_duplicate_detector, - sorting_service=mock_sorting_service, - ) - - unique, duplicates = orchestrator.process_transaction_group(transactions) - - # Verify document types preserved through pipeline - assert unique[0]["document_type"] == "bank_statement" - assert unique[1]["document_type"] == "credit_card_statement" From 0635820829468fe6b828274f05bce33e2053343c Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 14:59:45 +0000 Subject: [PATCH 3/5] chore: untrack logs/ dirs and broaden gitignore to **/logs/ --- .gitignore | 4 ++-- .../parser-free/logs/processing_activity.jsonl | 14 -------------- 2 files changed, 2 insertions(+), 16 deletions(-) delete mode 100644 packages/parser-free/logs/processing_activity.jsonl diff --git a/.gitignore b/.gitignore index 1fdd1ba..7b6fe98 100644 --- a/.gitignore +++ b/.gitignore @@ -121,8 +121,8 @@ output/ **/output/ # Logs -logs/*.log -logs/processing_activity.jsonl +logs/ +**/logs/ *.log # Development Artifacts diff --git a/packages/parser-free/logs/processing_activity.jsonl b/packages/parser-free/logs/processing_activity.jsonl deleted file mode 100644 index 676a79f..0000000 --- a/packages/parser-free/logs/processing_activity.jsonl +++ /dev/null @@ -1,14 +0,0 @@ -{"timestamp": "2026-03-18T12:32:34.490093", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:32:34.493341", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:32:34.502456", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:32:34.510783", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:32:34.518456", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:32:34.522150", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:32:34.527303", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:34:07.790615", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:34:07.794068", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:34:07.805428", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:34:07.813279", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:34:07.819899", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:34:07.826404", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} -{"timestamp": "2026-03-18T12:34:07.830051", "event_type": "processing", "pdf_count": 0, "pages_read": 0, "transaction_count": 0, "duplicate_count": 0, "output_formats": ["csv", "json", "excel"], "duration_seconds": 0.0} From b21bbaa4fc55a448c273d0e2c3f3693348a88ef2 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 15:02:44 +0000 Subject: [PATCH 4/5] style: fix isort ordering in service_registry.py --- .../src/bankstatements_core/services/service_registry.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/services/service_registry.py b/packages/parser-core/src/bankstatements_core/services/service_registry.py index 54d179d..52acfaa 100644 --- a/packages/parser-core/src/bankstatements_core/services/service_registry.py +++ b/packages/parser-core/src/bankstatements_core/services/service_registry.py @@ -95,9 +95,7 @@ def from_config( Fully wired ServiceRegistry instance. """ from bankstatements_core.config.column_config import get_column_names - from bankstatements_core.patterns.strategies import ( - AllFieldsDuplicateStrategy, - ) + from bankstatements_core.patterns.strategies import AllFieldsDuplicateStrategy from bankstatements_core.processor import find_matching_columns from bankstatements_core.services.duplicate_detector import ( DuplicateDetectionService, From 3ca3969b67c8fdcb68b4de5c994eb2d04fb1c5d6 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 15:09:42 +0000 Subject: [PATCH 5/5] style: fix isort ordering in processor.py --- packages/parser-core/src/bankstatements_core/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/parser-core/src/bankstatements_core/processor.py b/packages/parser-core/src/bankstatements_core/processor.py index 5ee3358..4f524f6 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -22,13 +22,13 @@ from bankstatements_core.services.pdf_processing_orchestrator import ( PDFProcessingOrchestrator, ) +from bankstatements_core.services.service_registry import ServiceRegistry from bankstatements_core.services.sorting_service import ( ChronologicalSortingStrategy, NoSortingStrategy, TransactionSortingService, ) from bankstatements_core.services.transaction_filter import TransactionFilterService -from bankstatements_core.services.service_registry import ServiceRegistry from bankstatements_core.services.transaction_processing_orchestrator import ( TransactionProcessingOrchestrator, )