From 85ac5efc3f4eadc05506f63d920b4ccad2284de3 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 23:09:24 +0000 Subject: [PATCH 01/14] feat(#32): make ScoringConfig injectable in TemplateDetector - Replace global DETECTOR_WEIGHTS / MIN_CONFIDENCE_THRESHOLD refs with self._scoring.weight_for() / self._scoring.min_confidence_threshold - Extract _run_scoring() private helper (scoring loop) from detect_template() - Add get_detection_explanation() -> DetectionExplanation for diagnostic introspection - Export ScoringConfig and DetectionExplanation from templates/__init__.py - Add 4 new tests: threshold boundary pass/fail, weight ordering, tie-break reason --- .../bankstatements_core/templates/__init__.py | 8 +- .../templates/template_detector.py | 335 +++++++++++++++--- .../tests/templates/test_template_detector.py | 270 +++++++++++++- 3 files changed, 554 insertions(+), 59 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/templates/__init__.py b/packages/parser-core/src/bankstatements_core/templates/__init__.py index 97db2ed..f50f9d0 100644 --- a/packages/parser-core/src/bankstatements_core/templates/__init__.py +++ b/packages/parser-core/src/bankstatements_core/templates/__init__.py @@ -2,7 +2,11 @@ from __future__ import annotations -from bankstatements_core.templates.template_detector import TemplateDetector +from bankstatements_core.templates.template_detector import ( + DetectionExplanation, + ScoringConfig, + TemplateDetector, +) from bankstatements_core.templates.template_model import ( BankTemplate, TemplateDetectionConfig, @@ -13,6 +17,8 @@ __all__ = [ "BankTemplate", + "DetectionExplanation", + "ScoringConfig", "TemplateDetectionConfig", "TemplateExtractionConfig", "TemplateProcessingConfig", diff --git a/packages/parser-core/src/bankstatements_core/templates/template_detector.py b/packages/parser-core/src/bankstatements_core/templates/template_detector.py index 4f4bf29..6791451 100644 --- 
a/packages/parser-core/src/bankstatements_core/templates/template_detector.py +++ b/packages/parser-core/src/bankstatements_core/templates/template_detector.py @@ -5,6 +5,7 @@ import logging import re from collections import defaultdict +from dataclasses import dataclass, field from pathlib import Path from pdfplumber.page import Page @@ -25,35 +26,107 @@ logger = logging.getLogger(__name__) -# Detector weights for Phase 2 aggregated scoring -# Higher weight = more reliable/specific detector -DETECTOR_WEIGHTS = { - "IBAN": 2.0, # Most reliable for bank statements (IBAN is unique) - "CardNumber": 2.0, # NEW: Most reliable for credit card statements - "LoanReference": 2.0, # NEW: Most reliable for loan statements - "ColumnHeader": 1.5, # Strong indicator of template structure - "Header": 1.0, # Baseline weight - "Filename": 0.8, # Less reliable (users can rename files) - "Exclusion": 0.0, # Binary filter, not used in scoring -} - -# Minimum aggregate confidence required to select a template -# Prevents weak matches from being selected -# Threshold of 0.6 allows single detector matches (e.g., Filename @ 0.85 * 0.8 = 0.68) -# while rejecting very weak matches (< 0.6) -MIN_CONFIDENCE_THRESHOLD = 0.6 + +@dataclass(frozen=True) +class ScoringConfig: + """Injectable scoring policy for TemplateDetector. + + Controls detector weights and the minimum aggregate confidence required + to select a template. Use ScoringConfig.default() for production behaviour. + + Attributes: + weights: Mapping of detector name to score multiplier. Unknown detector + names fall back to 1.0. + min_confidence_threshold: Minimum weighted aggregate score a template + must reach to be selected (exclusive lower bound + vs default; >= threshold passes). 
+ """ + + weights: dict[str, float] + min_confidence_threshold: float + + def __post_init__(self) -> None: + if not 0.0 < self.min_confidence_threshold <= 1.0: + raise ValueError( + f"min_confidence_threshold must be in (0.0, 1.0], " + f"got {self.min_confidence_threshold}" + ) + for name, w in self.weights.items(): + if w < 0.0: + raise ValueError( + f"Weight for detector '{name}' must be >= 0.0, got {w}" + ) + + @classmethod + def default(cls) -> "ScoringConfig": + """Production scoring — used when no config is injected.""" + return cls( + weights={ + "IBAN": 2.0, + "CardNumber": 2.0, + "LoanReference": 2.0, + "ColumnHeader": 1.5, + "Header": 1.0, + "Filename": 0.8, + "Exclusion": 0.0, + }, + min_confidence_threshold=0.6, + ) + + def weight_for(self, detector_name: str) -> float: + """Return the weight for a detector name, defaulting to 1.0 for unknowns.""" + return self.weights.get(detector_name, 1.0) + + +@dataclass +class DetectionExplanation: + """Structured account of why a template was (or was not) selected. + + Returned by TemplateDetector.get_detection_explanation(). Useful for + debugging mis-detections and writing tests that assert on scoring + outcomes without mocking individual detectors. + + Attributes: + selected_template_id: ID of the selected template, or None if default used. + selected_score: Weighted aggregate score of the selected template (0.0 if + default fallback was used). + threshold: The min_confidence_threshold that was applied. + passed_threshold: True if selected_score >= threshold. + per_template_scores: Weighted aggregate score for every candidate template. + per_template_breakdown: Human-readable per-detector contribution strings, + e.g. {"aib": ["IBAN=0.95*2.0=1.90", ...]}. + tie_broken: True if multiple templates shared the top score. + tie_winner_reason: One of "IBAN match", "max confidence", "alphabetical", + or None if no tie occurred. + used_default: True when the returned template is the registry default. 
+ default_reason: Why the default was used, or None if a real match was found. + """ + + selected_template_id: str | None + selected_score: float + threshold: float + passed_threshold: bool + per_template_scores: dict[str, float] + per_template_breakdown: dict[str, list[str]] + tie_broken: bool + tie_winner_reason: str | None + used_default: bool + default_reason: str | None class TemplateDetector: """Orchestrates multi-signal template detection using confidence-based scoring.""" - def __init__(self, registry: TemplateRegistry): + def __init__(self, registry: TemplateRegistry, scoring: ScoringConfig | None = None): """Initialize detector with template registry. Args: registry: TemplateRegistry containing available templates + scoring: Optional scoring config (weights + threshold). Defaults to + ScoringConfig.default() — production weights, threshold 0.6. """ self.registry = registry + self._scoring = scoring if scoring is not None else ScoringConfig.default() # Initialize detector chain # ExclusionDetector runs FIRST to filter out excluded templates @@ -193,43 +266,9 @@ def detect_template(self, pdf_path: Path, first_page: Page) -> BankTemplate: logger.warning("No enabled templates found, using default") return self.registry.get_default_template() - # Phase 2: Aggregate scores from all detectors - template_scores: dict[str, float] = defaultdict(float) - template_details: dict[str, list[DetectionResult]] = defaultdict(list) - excluded_templates: set[str] = set() - - for detector in self.detectors: - try: - results = detector.detect(pdf_path, first_page, templates) - if results: - for result in results: - # Skip excluded templates (confidence = 0.0) - if result.confidence == 0.0: - logger.debug( - f"Template '{result.template.id}' excluded by " - f"{result.detector_name}" - ) - # Mark as excluded - excluded_templates.add(result.template.id) - continue - - # Skip templates that were already excluded - if result.template.id in excluded_templates: - continue - - # Apply 
detector-specific weight - weight = DETECTOR_WEIGHTS.get(result.detector_name, 1.0) - weighted_score = result.confidence * weight - - template_scores[result.template.id] += weighted_score - template_details[result.template.id].append(result) - - except (AttributeError, ValueError, TypeError, OSError) as e: - # Expected errors: PDF access issues, text extraction failures, type errors - logger.error( - f"Error in {detector.name} detector for {pdf_path.name}: {e}" - ) - # Let unexpected errors bubble up + template_scores, template_details, excluded_templates = self._run_scoring( + pdf_path, first_page, templates + ) # Remove excluded templates valid_scores = { @@ -272,10 +311,10 @@ def detect_template(self, pdf_path: Path, first_page: Page) -> BankTemplate: best_score = valid_scores[best_template_id] # Phase 2: Check minimum confidence threshold - if best_score < MIN_CONFIDENCE_THRESHOLD: + if best_score < self._scoring.min_confidence_threshold: logger.warning( f"Best match '{best_template_id}' has confidence {best_score:.2f}, " - f"below threshold {MIN_CONFIDENCE_THRESHOLD}. Using default template." + f"below threshold {self._scoring.min_confidence_threshold}. Using default template." ) if document_type: return self.registry.get_default_for_type(document_type) @@ -308,6 +347,54 @@ def detect_template(self, pdf_path: Path, first_page: Page) -> BankTemplate: ) return best_template + def _run_scoring( + self, + pdf_path: Path, + first_page: Page, + templates: list[BankTemplate], + ) -> tuple[dict[str, float], dict[str, list[DetectionResult]], set[str]]: + """Run all detectors and aggregate weighted scores per template. + + Args: + pdf_path: Path to the PDF file. + first_page: First page of the PDF. + templates: Candidate templates to score. + + Returns: + Tuple of (template_scores, template_details, excluded_templates). 
+ """ + template_scores: dict[str, float] = defaultdict(float) + template_details: dict[str, list[DetectionResult]] = defaultdict(list) + excluded_templates: set[str] = set() + + for detector in self.detectors: + try: + results = detector.detect(pdf_path, first_page, templates) + if results: + for result in results: + if result.confidence == 0.0: + logger.debug( + f"Template '{result.template.id}' excluded by " + f"{result.detector_name}" + ) + excluded_templates.add(result.template.id) + continue + + if result.template.id in excluded_templates: + continue + + weight = self._scoring.weight_for(result.detector_name) + weighted_score = result.confidence * weight + template_scores[result.template.id] += weighted_score + template_details[result.template.id].append(result) + + except (AttributeError, ValueError, TypeError, OSError) as e: + logger.error( + f"Error in {detector.name} detector for {pdf_path.name}: {e}" + ) + + return dict(template_scores), dict(template_details), excluded_templates + def _break_tie( self, tied_templates: list[str], @@ -354,6 +441,140 @@ def _break_tie( logger.debug(f"Tie-breaker: {alphabetical_first} (alphabetically first)") return alphabetical_first + def get_detection_explanation( + self, pdf_path: Path, first_page: Page + ) -> DetectionExplanation: + """Return a structured explanation of template scoring for a given PDF. + + Runs the same scoring logic as detect_template() but returns full + diagnostic detail rather than just the winning template. + + Args: + pdf_path: Path to the PDF file. + first_page: First page of the PDF. + + Returns: + DetectionExplanation with per-template scores, breakdown, and + tie-break metadata. 
+ """ + document_type = self._classify_document_type(first_page) + + if document_type: + templates = self.registry.get_templates_by_type(document_type) + if not templates: + templates = self.registry.get_all_templates() + else: + templates = self.registry.get_all_templates() + + threshold = self._scoring.min_confidence_threshold + + if not templates: + return DetectionExplanation( + selected_template_id=None, + selected_score=0.0, + threshold=threshold, + passed_threshold=False, + per_template_scores={}, + per_template_breakdown={}, + tie_broken=False, + tie_winner_reason=None, + used_default=True, + default_reason="no templates available", + ) + + template_scores, template_details, excluded_templates = self._run_scoring( + pdf_path, first_page, templates + ) + + valid_scores = { + tid: score + for tid, score in template_scores.items() + if tid not in excluded_templates and score > 0.0 + } + + per_template_breakdown: dict[str, list[str]] = { + tid: [ + f"{d.detector_name}={d.confidence:.2f}" + f"*{self._scoring.weight_for(d.detector_name):.1f}" + f"={d.confidence * self._scoring.weight_for(d.detector_name):.2f}" + for d in sorted(details, key=lambda x: x.confidence, reverse=True) + ] + for tid, details in template_details.items() + } + + if not valid_scores: + return DetectionExplanation( + selected_template_id=None, + selected_score=0.0, + threshold=threshold, + passed_threshold=False, + per_template_scores=valid_scores, + per_template_breakdown=per_template_breakdown, + tie_broken=False, + tie_winner_reason=None, + used_default=True, + default_reason="no templates scored above zero", + ) + + best_template_id = max(valid_scores, key=lambda x: valid_scores[x]) + best_score = valid_scores[best_template_id] + + if best_score < threshold: + return DetectionExplanation( + selected_template_id=best_template_id, + selected_score=best_score, + threshold=threshold, + passed_threshold=False, + per_template_scores=valid_scores, + 
per_template_breakdown=per_template_breakdown, + tie_broken=False, + tie_winner_reason=None, + used_default=True, + default_reason=f"best score {best_score:.2f} below threshold {threshold}", + ) + + tied_templates = [ + tid for tid, score in valid_scores.items() if score == best_score + ] + tie_broken = len(tied_templates) > 1 + tie_winner_reason: str | None = None + + if tie_broken: + # Determine reason without re-running full tie-break + has_iban = { + tid: any(d.detector_name == "IBAN" for d in template_details.get(tid, [])) + for tid in tied_templates + } + if any(has_iban.values()): + tie_winner_reason = "IBAN match" + else: + max_confs = { + tid: max( + (d.confidence for d in template_details.get(tid, [])), default=0.0 + ) + for tid in tied_templates + } + top_max = max(max_confs.values()) + winners_by_max = [tid for tid, c in max_confs.items() if c == top_max] + if len(winners_by_max) == 1: + tie_winner_reason = "max confidence" + else: + tie_winner_reason = "alphabetical" + best_template_id = self._break_tie(tied_templates, template_details) + + return DetectionExplanation( + selected_template_id=best_template_id, + selected_score=best_score, + threshold=threshold, + passed_threshold=True, + per_template_scores=valid_scores, + per_template_breakdown=per_template_breakdown, + tie_broken=tie_broken, + tie_winner_reason=tie_winner_reason, + used_default=False, + default_reason=None, + ) + def force_template(self, template_id: str) -> BankTemplate | None: """Force use of specific template by ID. 
diff --git a/packages/parser-core/tests/templates/test_template_detector.py b/packages/parser-core/tests/templates/test_template_detector.py index e71ff62..5f58153 100644 --- a/packages/parser-core/tests/templates/test_template_detector.py +++ b/packages/parser-core/tests/templates/test_template_detector.py @@ -8,7 +8,11 @@ import pytest from bankstatements_core.templates.detectors.base import DetectionResult -from bankstatements_core.templates.template_detector import TemplateDetector +from bankstatements_core.templates.template_detector import ( + DetectionExplanation, + ScoringConfig, + TemplateDetector, +) from bankstatements_core.templates.template_model import ( BankTemplate, TemplateDetectionConfig, @@ -808,3 +812,267 @@ def test_detect_uses_type_specific_default( # Should call get_default_for_type with "credit_card_statement" mock_registry.get_default_for_type.assert_called_with("credit_card_statement") assert result == credit_card_default + + # ----------------------------------------------------------------------- + # ScoringConfig tests + # ----------------------------------------------------------------------- + + @patch( + "bankstatements_core.templates.detectors.exclusion_detector.ExclusionDetector.detect" + ) + @patch("bankstatements_core.templates.detectors.iban_detector.IBANDetector.detect") + @patch( + "bankstatements_core.templates.detectors.card_number_detector.CardNumberDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.loan_reference_detector.LoanReferenceDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.filename_detector.FilenameDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.header_detector.HeaderDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.column_header_detector.ColumnHeaderDetector.detect" + ) + def test_threshold_boundary_passes( + self, + mock_column_detect, + mock_header_detect, + mock_filename_detect, + mock_loan_detect, + 
mock_card_detect, + mock_iban_detect, + mock_exclusion_detect, + mock_registry, + mock_page, + ): + """Filename score exactly at threshold (0.75 * 0.8 = 0.60 >= 0.60) selects template.""" + revolut_template = mock_registry.get_all_templates()[1] + + mock_exclusion_detect.return_value = [] + mock_iban_detect.return_value = [] + mock_card_detect.return_value = [] + mock_loan_detect.return_value = [] + mock_header_detect.return_value = [] + mock_column_detect.return_value = [] + mock_filename_detect.return_value = [ + DetectionResult( + template=revolut_template, + confidence=0.75, # 0.75 * 0.8 = 0.60 — exactly at threshold + detector_name="Filename", + match_details={}, + ) + ] + + scoring = ScoringConfig( + weights={"Filename": 0.8}, + min_confidence_threshold=0.60, + ) + detector = TemplateDetector(mock_registry, scoring=scoring) + result = detector.detect_template(Path("test.pdf"), mock_page) + + assert result == revolut_template + + @patch( + "bankstatements_core.templates.detectors.exclusion_detector.ExclusionDetector.detect" + ) + @patch("bankstatements_core.templates.detectors.iban_detector.IBANDetector.detect") + @patch( + "bankstatements_core.templates.detectors.card_number_detector.CardNumberDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.loan_reference_detector.LoanReferenceDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.filename_detector.FilenameDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.header_detector.HeaderDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.column_header_detector.ColumnHeaderDetector.detect" + ) + def test_threshold_boundary_fails( + self, + mock_column_detect, + mock_header_detect, + mock_filename_detect, + mock_loan_detect, + mock_card_detect, + mock_iban_detect, + mock_exclusion_detect, + mock_registry, + mock_page, + ): + """Filename score just below threshold (0.74 * 0.8 = 0.592 < 0.60) uses default.""" + revolut_template = 
mock_registry.get_all_templates()[1] + default_template = mock_registry.get_default_template() + + mock_exclusion_detect.return_value = [] + mock_iban_detect.return_value = [] + mock_card_detect.return_value = [] + mock_loan_detect.return_value = [] + mock_header_detect.return_value = [] + mock_column_detect.return_value = [] + mock_filename_detect.return_value = [ + DetectionResult( + template=revolut_template, + confidence=0.74, # 0.74 * 0.8 = 0.592 — just below threshold + detector_name="Filename", + match_details={}, + ) + ] + + scoring = ScoringConfig( + weights={"Filename": 0.8}, + min_confidence_threshold=0.60, + ) + detector = TemplateDetector(mock_registry, scoring=scoring) + result = detector.detect_template(Path("test.pdf"), mock_page) + + assert result == default_template + + @patch( + "bankstatements_core.templates.detectors.exclusion_detector.ExclusionDetector.detect" + ) + @patch("bankstatements_core.templates.detectors.iban_detector.IBANDetector.detect") + @patch( + "bankstatements_core.templates.detectors.card_number_detector.CardNumberDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.loan_reference_detector.LoanReferenceDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.filename_detector.FilenameDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.header_detector.HeaderDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.column_header_detector.ColumnHeaderDetector.detect" + ) + def test_weight_ordering_iban_beats_column_header( + self, + mock_column_detect, + mock_header_detect, + mock_filename_detect, + mock_loan_detect, + mock_card_detect, + mock_iban_detect, + mock_exclusion_detect, + mock_registry, + mock_page, + ): + """IBAN weight 2.0 (0.5*2.0=1.0) beats ColumnHeader weight 1.5 (0.6*1.5=0.9).""" + aib_template = mock_registry.get_all_templates()[0] + revolut_template = mock_registry.get_all_templates()[1] + + mock_exclusion_detect.return_value = [] + 
mock_card_detect.return_value = [] + mock_loan_detect.return_value = [] + mock_filename_detect.return_value = [] + mock_header_detect.return_value = [] + # AIB scores via IBAN: 0.5 * 2.0 = 1.0 + mock_iban_detect.return_value = [ + DetectionResult( + template=aib_template, + confidence=0.5, + detector_name="IBAN", + match_details={}, + ) + ] + # Revolut scores via ColumnHeader: 0.6 * 1.5 = 0.9 + mock_column_detect.return_value = [ + DetectionResult( + template=revolut_template, + confidence=0.6, + detector_name="ColumnHeader", + match_details={}, + ) + ] + + scoring = ScoringConfig.default() + detector = TemplateDetector(mock_registry, scoring=scoring) + result = detector.detect_template(Path("test.pdf"), mock_page) + + # AIB wins: 1.0 > 0.9 + assert result == aib_template + + @patch( + "bankstatements_core.templates.detectors.exclusion_detector.ExclusionDetector.detect" + ) + @patch("bankstatements_core.templates.detectors.iban_detector.IBANDetector.detect") + @patch( + "bankstatements_core.templates.detectors.card_number_detector.CardNumberDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.loan_reference_detector.LoanReferenceDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.filename_detector.FilenameDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.header_detector.HeaderDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.column_header_detector.ColumnHeaderDetector.detect" + ) + def test_get_detection_explanation_tie_break_reason( + self, + mock_column_detect, + mock_header_detect, + mock_filename_detect, + mock_loan_detect, + mock_card_detect, + mock_iban_detect, + mock_exclusion_detect, + mock_registry, + mock_page, + ): + """get_detection_explanation populates tie_winner_reason when a tie is broken.""" + aib_template = mock_registry.get_all_templates()[0] + revolut_template = mock_registry.get_all_templates()[1] + + mock_exclusion_detect.return_value = [] + 
mock_card_detect.return_value = [] + mock_loan_detect.return_value = [] + mock_filename_detect.return_value = [] + # Create a tie: AIB via IBAN (0.25*2.0=0.5) + Header (0.3*1.0=0.3) = 0.8 + # Revolut via Header (0.8*1.0=0.8) = 0.8 + mock_iban_detect.return_value = [ + DetectionResult( + template=aib_template, + confidence=0.25, + detector_name="IBAN", + match_details={}, + ) + ] + mock_header_detect.return_value = [ + DetectionResult( + template=aib_template, + confidence=0.3, + detector_name="Header", + match_details={}, + ), + DetectionResult( + template=revolut_template, + confidence=0.8, + detector_name="Header", + match_details={}, + ), + ] + mock_column_detect.return_value = [] + + detector = TemplateDetector(mock_registry) + explanation = detector.get_detection_explanation(Path("test.pdf"), mock_page) + + assert isinstance(explanation, DetectionExplanation) + assert explanation.tie_broken is True + assert explanation.tie_winner_reason == "IBAN match" + assert explanation.selected_template_id == "aib" + assert explanation.passed_threshold is True + assert explanation.used_default is False + assert "aib" in explanation.per_template_scores + assert "revolut" in explanation.per_template_scores From 55b66434741a6f848ed051afa4a5a8b503a6da9b Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 07:54:56 +0000 Subject: [PATCH 02/14] feat(23-01): wire PDFTableExtractor.extract() and facade to return ExtractionResult - PDFTableExtractor.extract() now returns ExtractionResult (not 3-tuple) - Credit card early-return path returns ExtractionResult with warnings entry - Normal return path wraps dicts_to_transactions(rows) in ExtractionResult - extraction_facade.extract_tables_from_pdf() return annotation -> ExtractionResult - All 6 extraction-layer test files updated: field access replaces tuple unpacking - ExtractionOrchestrator.extract_from_pdf() unpacks ExtractionResult, re-packs as tuple for upstream callers (Rule 3 - blocking fix) - All mock return values 
in test_excluded_files_logging, test_processor, test_processor_refactored_methods, test_extraction_orchestrator updated to ExtractionResult - 1360 tests pass, 9 skipped (all expected) --- .../extraction/extraction_facade.py | 6 +- .../extraction/pdf_extractor.py | 22 ++++- .../services/extraction_orchestrator.py | 5 +- .../tests/extraction/test_page_skipping.py | 53 ++++++------ .../tests/extraction/test_pdf_extractor.py | 49 +++++------ .../services/test_extraction_orchestrator.py | 33 +++++--- .../templates/test_template_integration.py | 6 +- .../tests/test_credit_card_detection.py | 81 ++++++++++++------- .../tests/test_excluded_files_logging.py | 64 +++++++++++---- .../tests/test_pdf_table_extractor.py | 72 +++++++++-------- packages/parser-core/tests/test_processor.py | 22 ++++- .../test_processor_refactored_methods.py | 23 ++++-- .../test_transaction_data_no_blocking.py | 44 +++++----- 13 files changed, 302 insertions(+), 178 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py b/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py index 57e3d34..7fdc3d6 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py +++ b/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py @@ -12,6 +12,7 @@ from bankstatements_core.config.column_config import DEFAULT_COLUMNS from bankstatements_core.extraction.extraction_params import TABLE_BOTTOM_Y, TABLE_TOP_Y +from bankstatements_core.domain import ExtractionResult if TYPE_CHECKING: from bankstatements_core.extraction.row_classifiers import RowClassifier @@ -70,7 +71,7 @@ def extract_tables_from_pdf( enable_page_validation: bool | None = None, enable_header_check: bool | None = None, template: "BankTemplate" | None = None, -) -> tuple[list[dict], int, str | None]: +) -> ExtractionResult: """ Extract table data from PDF within specified bounds (facade function). 
@@ -87,7 +88,8 @@ def extract_tables_from_pdf( template: Optional BankTemplate to use for extraction configuration Returns: - Tuple of (extracted rows, number of pages, IBAN if found) + ExtractionResult containing extracted transactions, page count, IBAN, + source file path, and any document-level warnings """ from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor diff --git a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py index d0e2652..549e6c2 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py @@ -22,6 +22,8 @@ StatefulPageRowProcessor, extract_filename_date, ) +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions logger = logging.getLogger(__name__) @@ -72,14 +74,15 @@ def __init__( else: self._pdf_reader = pdf_reader - def extract(self, pdf_path: Path) -> tuple[list[dict], int, str | None]: + def extract(self, pdf_path: Path) -> ExtractionResult: """Extract table data from PDF file. Args: pdf_path: Path to the PDF file Returns: - Tuple of (extracted rows, total page count, IBAN if found) + ExtractionResult containing extracted transactions, page count, + IBAN if found, source file path, and any document-level warnings """ rows: list[dict] = [] iban = None @@ -104,7 +107,13 @@ def extract(self, pdf_path: Path) -> tuple[list[dict], int, str | None]: f"Credit card statement detected in {pdf_path.name}. " f"Credit card statements are not currently supported. Skipping file." 
) - return [], len(pdf.pages), None + return ExtractionResult( + transactions=[], + page_count=len(pdf.pages), + iban=None, + source_file=pdf_path, + warnings=["credit card statement detected, skipped"], + ) if iban is None and page_num == 1: iban = self._header_analyser.extract_iban(page) @@ -124,7 +133,12 @@ def extract(self, pdf_path: Path) -> tuple[list[dict], int, str | None]: rows.extend(page_processor.process_page(page_rows)) - return rows, len(pdf.pages), iban + return ExtractionResult( + transactions=dicts_to_transactions(rows), + page_count=len(pdf.pages), + iban=iban, + source_file=pdf_path, + ) def _extract_page(self, page: Any, page_num: int) -> list[dict] | None: """Extract rows from a single page. diff --git a/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py index 2e1efa2..4cd9b14 100644 --- a/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py @@ -158,7 +158,7 @@ def extract_from_pdf( logger.info(f"Using template: {template.name} for {pdf_path.name}") # Extract transactions using template - rows, page_count, iban = extract_tables_from_pdf( + extraction_result = extract_tables_from_pdf( pdf_path, self._config.table_top_y, self._config.table_bottom_y, @@ -166,6 +166,9 @@ def extract_from_pdf( self._config.enable_dynamic_boundary, template=template, ) + rows = [tx.to_dict() for tx in extraction_result.transactions] + page_count = extraction_result.page_count + iban = extraction_result.iban # Log IBAN if found if iban: diff --git a/packages/parser-core/tests/extraction/test_page_skipping.py b/packages/parser-core/tests/extraction/test_page_skipping.py index fd9ced3..fe224d8 100644 --- a/packages/parser-core/tests/extraction/test_page_skipping.py +++ b/packages/parser-core/tests/extraction/test_page_skipping.py @@ -13,6 +13,7 
@@ import pytest +from bankstatements_core.domain import ExtractionResult from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor # Test columns configuration @@ -94,13 +95,13 @@ def test_skip_page_without_headers_and_continue(self, mock_pdfplumber): extractor = PDFTableExtractor( columns=TEST_COLUMNS, enable_dynamic_boundary=True ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Assertions - assert page_count == 3 # All pages were examined - assert len(rows) == 2 # Only 2 transactions (from pages 2 and 3) - assert rows[0]["Filename"] == "test.pdf" - assert rows[1]["Filename"] == "test.pdf" + assert result.page_count == 3 # All pages were examined + assert len(result.transactions) == 2 # Only 2 transactions (from pages 2 and 3) + assert result.transactions[0].filename == "test.pdf" + assert result.transactions[1].filename == "test.pdf" @patch.dict("os.environ", {"REQUIRE_TABLE_HEADERS": "true"}) @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") @@ -137,10 +138,10 @@ def test_skip_all_pages_without_tables(self, mock_pdfplumber): extractor = PDFTableExtractor( columns=TEST_COLUMNS, enable_dynamic_boundary=True ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 2 # All pages were examined - assert len(rows) == 0 # No transactions extracted + assert result.page_count == 2 # All pages were examined + assert len(result.transactions) == 0 # No transactions extracted @patch.dict("os.environ", {"REQUIRE_TABLE_HEADERS": "true"}) @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") @@ -174,10 +175,10 @@ def test_process_all_pages_with_tables(self, mock_pdfplumber): extractor = PDFTableExtractor( columns=TEST_COLUMNS, enable_dynamic_boundary=True ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = 
extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 3 - assert len(rows) == 3 # One transaction from each page + assert result.page_count == 3 + assert len(result.transactions) == 3 # One transaction from each page @patch.dict("os.environ", {"REQUIRE_TABLE_HEADERS": "true"}) @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") @@ -234,10 +235,10 @@ def test_skip_middle_page_without_table(self, mock_pdfplumber): extractor = PDFTableExtractor( columns=TEST_COLUMNS, enable_dynamic_boundary=True ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 3 # All 3 pages examined - assert len(rows) == 2 # Only pages 1 and 3 had transactions + assert result.page_count == 3 # All 3 pages examined + assert len(result.transactions) == 2 # Only pages 1 and 3 had transactions @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_page_validation_disabled_processes_all_pages(self, mock_pdfplumber): @@ -262,11 +263,11 @@ def test_page_validation_disabled_processes_all_pages(self, mock_pdfplumber): extractor = PDFTableExtractor( columns=TEST_COLUMNS, enable_page_validation=False ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 + assert result.page_count == 1 # Should process page even without headers - assert len(rows) >= 0 # May or may not have valid transactions + assert len(result.transactions) >= 0 # May or may not have valid transactions @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_validation_enabled_by_default(self, mock_pdfplumber): @@ -298,10 +299,10 @@ def test_validation_enabled_by_default(self, mock_pdfplumber): assert extractor.header_check_enabled is True # Extract and verify page is skipped due to missing headers - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result 
= extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 # Page was examined - assert len(rows) == 0 # Page was skipped (no headers) + assert result.page_count == 1 # Page was examined + assert len(result.transactions) == 0 # Page was skipped (no headers) @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_empty_pdf_returns_zero_rows(self, mock_pdfplumber): @@ -311,10 +312,10 @@ def test_empty_pdf_returns_zero_rows(self, mock_pdfplumber): mock_pdfplumber.return_value = mock_pdf extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/empty.pdf")) + result = extractor.extract(Path("/tmp/empty.pdf")) - assert page_count == 0 - assert len(rows) == 0 + assert result.page_count == 0 + assert len(result.transactions) == 0 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_page_with_insufficient_rows_skipped(self, mock_pdfplumber): @@ -336,8 +337,8 @@ def test_page_with_insufficient_rows_skipped(self, mock_pdfplumber): mock_cropped.extract_words.return_value = mock_words extractor = PDFTableExtractor(columns=TEST_COLUMNS, enable_page_validation=True) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 + assert result.page_count == 1 # Should be skipped due to insufficient transaction rows - assert len(rows) == 0 + assert len(result.transactions) == 0 diff --git a/packages/parser-core/tests/extraction/test_pdf_extractor.py b/packages/parser-core/tests/extraction/test_pdf_extractor.py index dffd7c8..5fa3e2b 100644 --- a/packages/parser-core/tests/extraction/test_pdf_extractor.py +++ b/packages/parser-core/tests/extraction/test_pdf_extractor.py @@ -7,6 +7,8 @@ import pytest +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.extraction.pdf_extractor import 
PDFTableExtractor from bankstatements_core.extraction.row_post_processor import ( RowPostProcessor, @@ -189,11 +191,12 @@ def test_extract_basic_flow(self, mock_pdfplumber): enable_page_validation=False, enable_header_check=False, ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 - assert len(rows) == 1 - assert rows[0]["Filename"] == "test.pdf" + assert isinstance(result, ExtractionResult) + assert result.page_count == 1 + assert len(result.transactions) == 1 + assert result.transactions[0].filename == "test.pdf" @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_extract_with_dynamic_boundary(self, mock_pdfplumber): @@ -224,9 +227,9 @@ def test_extract_with_dynamic_boundary(self, mock_pdfplumber): enable_page_validation=False, enable_header_check=False, ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 + assert result.page_count == 1 # Crop is called for: credit card check (header), table extraction (initial + final) assert mock_page.crop.call_count >= 2 # At least initial + final for table @@ -249,11 +252,11 @@ def test_extract_with_page_validation_failure(self, mock_pdfplumber): mock_cropped.extract_words.return_value = mock_words extractor = PDFTableExtractor(columns=TEST_COLUMNS, enable_page_validation=True) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 + assert result.page_count == 1 # No valid transactions, so rows should be empty - assert len(rows) == 0 + assert len(result.transactions) == 0 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_extract_multiple_pages(self, mock_pdfplumber): @@ -289,10 +292,10 @@ def test_extract_multiple_pages(self, mock_pdfplumber): enable_page_validation=False, 
enable_header_check=False, ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 2 - assert len(rows) == 2 + assert result.page_count == 2 + assert len(result.transactions) == 2 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_extract_date_propagation_across_rows(self, mock_pdfplumber): @@ -329,12 +332,12 @@ def test_extract_date_propagation_across_rows(self, mock_pdfplumber): enable_page_validation=False, enable_header_check=False, ) - rows, _, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert len(rows) == 2 + assert len(result.transactions) == 2 # Both rows should have the same date (2023 is captured if x1 is provided for all words) - assert "01 Jan 2023" in rows[0]["Date"] - assert "01 Jan 2023" in rows[1]["Date"] + assert "01 Jan 2023" in result.transactions[0].date + assert "01 Jan 2023" in result.transactions[1].date def test_page_validation_constructor_parameter(self): """Test page validation setting via constructor parameter.""" @@ -431,10 +434,10 @@ def test_per_page_boundaries_applied_correctly(self, mock_pdfplumber): enable_header_check=False, ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 2 - assert len(rows) == 2 + assert result.page_count == 2 + assert len(result.transactions) == 2 # Verify crop was called with correct boundaries # Page 1 should use override (490) @@ -480,10 +483,10 @@ def test_per_page_boundaries_without_extraction_config(self, mock_pdfplumber): enable_header_check=False, ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 - assert len(rows) == 1 + assert result.page_count == 1 + assert len(result.transactions) == 1 # Verify crop was called with instance defaults 
(300) crop_calls = [ @@ -535,7 +538,7 @@ def test_per_page_header_check_top_y_override(self, mock_pdfplumber): enable_header_check=True, # Enable header check ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Verify header area crop was called with override value (450) header_crop_calls = [ diff --git a/packages/parser-core/tests/services/test_extraction_orchestrator.py b/packages/parser-core/tests/services/test_extraction_orchestrator.py index 890a798..24aaf9e 100644 --- a/packages/parser-core/tests/services/test_extraction_orchestrator.py +++ b/packages/parser-core/tests/services/test_extraction_orchestrator.py @@ -8,6 +8,8 @@ from unittest.mock import MagicMock, patch from bankstatements_core.config.processor_config import ExtractionConfig +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.services.extraction_orchestrator import ExtractionOrchestrator @@ -34,10 +36,11 @@ def test_extract_from_pdf_success(self, mock_extract): pdf_path.write_text("fake pdf") # Mock successful extraction - mock_extract.return_value = ( - [{"Date": "01/01/23", "Details": "Test"}], - 5, - "IE12BOFI90000112345", + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([{"Date": "01/01/23", "Details": "Test"}]), + page_count=5, + iban="IE12BOFI90000112345", + source_file=pdf_path, ) rows, pages, iban = self.orchestrator.extract_from_pdf(pdf_path) @@ -59,7 +62,12 @@ def test_extract_from_pdf_with_forced_template(self, mock_extract): mock_template = MagicMock() mock_template.name = "TestTemplate" - mock_extract.return_value = ([{"Date": "01/01/23"}], 3, None) + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([{"Date": "01/01/23"}]), + page_count=3, + iban=None, + source_file=pdf_path, + ) rows, pages, iban = self.orchestrator.extract_from_pdf( 
pdf_path, forced_template=mock_template @@ -79,7 +87,9 @@ def test_extract_from_pdf_no_data(self, mock_extract): pdf_path = Path(self.temp_dir) / "test.pdf" pdf_path.write_text("fake pdf") - mock_extract.return_value = ([], 2, None) + mock_extract.return_value = ExtractionResult( + transactions=[], page_count=2, iban=None, source_file=pdf_path + ) rows, pages, iban = self.orchestrator.extract_from_pdf(pdf_path) @@ -95,13 +105,14 @@ def test_extract_from_pdf_returns_rows_as_is(self, mock_extract): pdf_path = Path(self.temp_dir) / "test.pdf" pdf_path.write_text("fake pdf") - mock_extract.return_value = ( - [ + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([ {"Date": "01/01/23", "Details": "Test1"}, {"Date": "02/01/23", "Details": "Test2"}, - ], - 3, - None, + ]), + page_count=3, + iban=None, + source_file=pdf_path, ) rows, _, _ = self.orchestrator.extract_from_pdf(pdf_path) diff --git a/packages/parser-core/tests/templates/test_template_integration.py b/packages/parser-core/tests/templates/test_template_integration.py index fd25a7d..63b01ec 100644 --- a/packages/parser-core/tests/templates/test_template_integration.py +++ b/packages/parser-core/tests/templates/test_template_integration.py @@ -239,7 +239,7 @@ def test_revolut_pdf_extracts_all_transactions(self): "input/account-statement_2025-01-01_2026-01-30_en-ie_d04719.pdf" ) - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( pdf_path, table_top_y=revolut_template.extraction.table_top_y, table_bottom_y=revolut_template.extraction.table_bottom_y, @@ -250,9 +250,9 @@ def test_revolut_pdf_extracts_all_transactions(self): # Should extract 67+ transactions from pages 2-4 # (Page 1 and 5 have different structure and are skipped) - assert len(rows) >= 67 + assert len(result.transactions) >= 67 # Verify we have January 2025 transactions (not missing first 24) - dates = [row.get("Date", "") for row in rows if row.get("Date")] + dates = [tx.date for tx 
in result.transactions if tx.date] jan_transactions = [d for d in dates if "Jan 2025" in d] assert len(jan_transactions) >= 4 # At least 4 January transactions diff --git a/packages/parser-core/tests/test_credit_card_detection.py b/packages/parser-core/tests/test_credit_card_detection.py index d4d7160..85935a6 100644 --- a/packages/parser-core/tests/test_credit_card_detection.py +++ b/packages/parser-core/tests/test_credit_card_detection.py @@ -7,6 +7,7 @@ import pytest +from bankstatements_core.domain import ExtractionResult from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor # Test columns configuration @@ -40,12 +41,13 @@ def test_detect_credit_card_statement_with_card_number(self, mock_pdfplumber): """ extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/credit_card.pdf")) + result = extractor.extract(Path("/tmp/credit_card.pdf")) # Should return empty rows (skipped) - assert len(rows) == 0 - assert page_count == 1 # Still counts the pages - assert iban is None + assert isinstance(result, ExtractionResult) + assert len(result.transactions) == 0 + assert result.page_count == 1 # Still counts the pages + assert result.iban is None @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_detect_credit_card_statement_case_insensitive(self, mock_pdfplumber): @@ -67,9 +69,9 @@ def test_detect_credit_card_statement_case_insensitive(self, mock_pdfplumber): mock_page.extract_text.return_value = f"Statement\n{text}\nDetails" extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert len(rows) == 0, f"Failed to detect: {text}" + assert len(result.transactions) == 0, f"Failed to detect: {text}" @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_detect_credit_card_statement_with_credit_limit(self, mock_pdfplumber): @@ 
-87,11 +89,11 @@ def test_detect_credit_card_statement_with_credit_limit(self, mock_pdfplumber): """ extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/credit_card.pdf")) + result = extractor.extract(Path("/tmp/credit_card.pdf")) - assert len(rows) == 0 - assert page_count == 1 - assert iban is None + assert len(result.transactions) == 0 + assert result.page_count == 1 + assert result.iban is None @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_detect_credit_card_statement_with_credit_card_text(self, mock_pdfplumber): @@ -107,10 +109,10 @@ def test_detect_credit_card_statement_with_credit_card_text(self, mock_pdfplumbe """ extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/credit_card.pdf")) + result = extractor.extract(Path("/tmp/credit_card.pdf")) - assert len(rows) == 0 - assert page_count == 1 + assert len(result.transactions) == 0 + assert result.page_count == 1 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_detect_credit_card_statement_with_visa(self, mock_pdfplumber): @@ -126,10 +128,10 @@ def test_detect_credit_card_statement_with_visa(self, mock_pdfplumber): """ extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/visa.pdf")) + result = extractor.extract(Path("/tmp/visa.pdf")) - assert len(rows) == 0 - assert page_count == 1 + assert len(result.transactions) == 0 + assert result.page_count == 1 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_detect_credit_card_statement_with_mastercard(self, mock_pdfplumber): @@ -145,10 +147,10 @@ def test_detect_credit_card_statement_with_mastercard(self, mock_pdfplumber): """ extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/mastercard.pdf")) + result = 
extractor.extract(Path("/tmp/mastercard.pdf")) - assert len(rows) == 0 - assert page_count == 1 + assert len(result.transactions) == 0 + assert result.page_count == 1 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_all_credit_card_patterns(self, mock_pdfplumber): @@ -171,9 +173,9 @@ def test_all_credit_card_patterns(self, mock_pdfplumber): mock_page.extract_text.return_value = f"Header\n{text}\nFooter" extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert len(rows) == 0, f"Failed to detect with pattern: {pattern_name}" + assert len(result.transactions) == 0, f"Failed to detect with pattern: {pattern_name}" @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_does_not_detect_false_positives(self, mock_pdfplumber): @@ -206,10 +208,10 @@ def test_does_not_detect_false_positives(self, mock_pdfplumber): mock_cropped.extract_words.return_value = mock_words extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/bank.pdf")) + result = extractor.extract(Path("/tmp/bank.pdf")) # Should process normally - assert page_count == 1 + assert result.page_count == 1 # May or may not have rows depending on validation, but shouldn't be empty due to CC detection # The key is it didn't skip due to credit card detection @@ -231,10 +233,10 @@ def test_credit_card_detection_with_extraction_error(self, mock_pdfplumber): extractor = PDFTableExtractor(columns=TEST_COLUMNS) # Should not crash, should continue processing - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Should complete without raising exception - assert page_count == 1 + assert result.page_count == 1 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def 
test_credit_card_detection_only_checks_first_page(self, mock_pdfplumber): @@ -260,10 +262,10 @@ def test_credit_card_detection_only_checks_first_page(self, mock_pdfplumber): mock_cropped2.extract_words.return_value = [] extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Should process both pages (not skipped) - assert page_count == 2 + assert result.page_count == 2 def test_is_credit_card_statement_method_directly(self): """Test the _is_credit_card_statement method directly.""" @@ -331,3 +333,28 @@ def test_visa_in_transaction_not_detected(self): ) is False ) + + @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") + def test_credit_card_early_return_produces_extraction_result_with_warning( + self, mock_pdfplumber + ): + """Test that credit card detection returns ExtractionResult with warning message.""" + mock_pdf = MagicMock() + mock_page = MagicMock() + mock_page.width = 612 + mock_pdf.pages = [mock_page] + mock_pdfplumber.return_value = mock_pdf + + # Mock the cropped header area to return credit card text + mock_header = MagicMock() + mock_header.extract_text.return_value = "Statement for Card Number 1234" + mock_page.crop.return_value = mock_header + + extractor = PDFTableExtractor(columns=TEST_COLUMNS) + result = extractor.extract(Path("/tmp/credit_card.pdf")) + + assert isinstance(result, ExtractionResult) + assert len(result.transactions) == 0 + assert result.iban is None + assert len(result.warnings) > 0 + assert "credit card" in result.warnings[0].lower() diff --git a/packages/parser-core/tests/test_excluded_files_logging.py b/packages/parser-core/tests/test_excluded_files_logging.py index 8ab1d55..38492df 100644 --- a/packages/parser-core/tests/test_excluded_files_logging.py +++ b/packages/parser-core/tests/test_excluded_files_logging.py @@ -10,6 +10,8 @@ import pytest from 
bankstatements_core.config.processor_config import ProcessorConfig +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor @@ -40,7 +42,11 @@ def test_excluded_files_json_created(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ([], 1, None) # Empty rows, 1 page, no IBAN + mock_extract.return_value = ExtractionResult( + transactions=[], page_count=1, iban=None, + source_file=Path("credit_card.pdf"), + warnings=["credit card statement detected, skipped"], + ) processor = create_test_processor(input_dir, output_dir) processor.run() @@ -77,10 +83,13 @@ def test_excluded_files_json_not_created_when_no_exclusions(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ( - [{"Date": "01/12/23", "Details": "Test", "Debit €": "100"}], - 1, - "IE29AIBK93115212345678", + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([ + {"Date": "01/12/23", "Details": "Test", "Debit €": "100"} + ]), + page_count=1, + iban="IE29AIBK93115212345678", + source_file=Path("bank_statement.pdf"), ) processor = create_test_processor(input_dir, output_dir) @@ -107,7 +116,11 @@ def test_excluded_files_multiple_exclusions(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ([], 2, None) + mock_extract.return_value = ExtractionResult( + transactions=[], page_count=2, iban=None, + source_file=Path("credit_card.pdf"), + warnings=["credit card statement detected, skipped"], + ) processor = create_test_processor(input_dir, output_dir) processor.run() @@ -146,12 +159,19 @@ def test_excluded_files_mixed_scenarios(self): # Mock PDFTableExtractor to 
return different results def mock_extract_side_effect(pdf_path, *args, **kwargs): if "credit_card" in str(pdf_path): - return ([], 1, None) # Excluded + return ExtractionResult( + transactions=[], page_count=1, iban=None, + source_file=pdf_path, + warnings=["credit card statement detected, skipped"], + ) else: - return ( - [{"Date": "01/12/23", "Details": "Test", "Debit €": "100"}], - 1, - "IE29AIBK93115212345678", + return ExtractionResult( + transactions=dicts_to_transactions([ + {"Date": "01/12/23", "Details": "Test", "Debit €": "100"} + ]), + page_count=1, + iban="IE29AIBK93115212345678", + source_file=pdf_path, ) with patch( @@ -187,7 +207,11 @@ def test_excluded_files_json_structure(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ([], 3, None) # 3 pages + mock_extract.return_value = ExtractionResult( + transactions=[], page_count=3, iban=None, + source_file=Path("test.pdf"), + warnings=["credit card statement detected, skipped"], + ) processor = create_test_processor(input_dir, output_dir) processor.run() @@ -302,7 +326,10 @@ def test_pdf_with_iban_not_excluded(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ([], 2, "IE29AIBK93115212345678") + mock_extract.return_value = ExtractionResult( + transactions=[], page_count=2, iban="IE29AIBK93115212345678", + source_file=Path("bank_statement.pdf"), + ) processor = create_test_processor(input_dir, output_dir) processor.run() @@ -333,10 +360,17 @@ def test_mixed_with_iban_and_without_iban(self): def mock_extract_side_effect(pdf_path, *args, **kwargs): if "bank_with_iban" in str(pdf_path): # Has IBAN, empty rows (should NOT be excluded) - return ([], 2, "IE29AIBK93115212345678") + return ExtractionResult( + transactions=[], page_count=2, iban="IE29AIBK93115212345678", + source_file=pdf_path, + ) else: # No IBAN, empty rows 
(SHOULD be excluded) - return ([], 1, None) + return ExtractionResult( + transactions=[], page_count=1, iban=None, + source_file=pdf_path, + warnings=["credit card statement detected, skipped"], + ) with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" diff --git a/packages/parser-core/tests/test_pdf_table_extractor.py b/packages/parser-core/tests/test_pdf_table_extractor.py index a9f8414..d515393 100644 --- a/packages/parser-core/tests/test_pdf_table_extractor.py +++ b/packages/parser-core/tests/test_pdf_table_extractor.py @@ -7,6 +7,8 @@ from unittest.mock import MagicMock, Mock, patch from bankstatements_core.extraction.column_identifier import ColumnTypeIdentifier +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.pdf_table_extractor import ( DEFAULT_COLUMNS, analyze_content_density, @@ -67,29 +69,29 @@ def test_extract_tables_with_date_propagation(self, mock_pdfplumber): # Test the extraction test_pdf_path = Path("/tmp/test.pdf") - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_page_validation=False, enable_header_check=False ) # Verify results - self.assertEqual(page_count, 1) - self.assertEqual(len(rows), 3) + self.assertEqual(result.page_count, 1) + self.assertEqual(len(result.transactions), 3) # Check first transaction (has original date) - self.assertEqual(rows[0]["Date"].strip(), "01 Jan 2024") - self.assertEqual(rows[0]["Details"].strip(), "Salary Payment") - self.assertEqual(rows[0]["Credit €"].strip(), "3000.00") - self.assertEqual(rows[0]["Filename"], "test.pdf") + self.assertEqual(result.transactions[0].date.strip(), "01 Jan 2024") + self.assertEqual(result.transactions[0].details.strip(), "Salary Payment") + self.assertEqual(result.transactions[0].credit.strip(), "3000.00") + self.assertEqual(result.transactions[0].filename, "test.pdf") # Check 
second transaction (should inherit date from first) - self.assertEqual(rows[1]["Date"].strip(), "01 Jan 2024") # Inherited! - self.assertEqual(rows[1]["Details"].strip(), "Coffee Shop") - self.assertEqual(rows[1]["Debit €"].strip(), "5.50") + self.assertEqual(result.transactions[1].date.strip(), "01 Jan 2024") # Inherited! + self.assertEqual(result.transactions[1].details.strip(), "Coffee Shop") + self.assertEqual(result.transactions[1].debit.strip(), "5.50") # Check third transaction (has new date) - self.assertEqual(rows[2]["Date"].strip(), "02 Jan 2024") - self.assertEqual(rows[2]["Details"].strip(), "Gas Station") - self.assertEqual(rows[2]["Debit €"].strip(), "40.00") + self.assertEqual(result.transactions[2].date.strip(), "02 Jan 2024") + self.assertEqual(result.transactions[2].details.strip(), "Gas Station") + self.assertEqual(result.transactions[2].debit.strip(), "40.00") @patch("bankstatements_core.pdf_table_extractor.pdfplumber.open") def test_extract_tables_filename_tagging(self, mock_pdfplumber): @@ -117,13 +119,13 @@ def test_extract_tables_filename_tagging(self, mock_pdfplumber): # Test with specific filename - pass parameters directly instead of using env vars test_pdf_path = Path("/tmp/bank_statement_jan2024.pdf") - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_page_validation=False, enable_header_check=False ) # Verify filename tagging - self.assertEqual(len(rows), 1) - self.assertEqual(rows[0]["Filename"], "bank_statement_jan2024.pdf") + self.assertEqual(len(result.transactions), 1) + self.assertEqual(result.transactions[0].filename, "bank_statement_jan2024.pdf") @patch("bankstatements_core.pdf_table_extractor.pdfplumber.open") def test_extract_tables_empty_rows_filtered(self, mock_pdfplumber): @@ -153,13 +155,13 @@ def test_extract_tables_empty_rows_filtered(self, mock_pdfplumber): mock_cropped.extract_words.return_value = mock_words test_pdf_path = Path("/tmp/test.pdf") - rows, 
page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_page_validation=False, enable_header_check=False ) # Should only have 2 valid rows (empty row filtered out) - self.assertEqual(len(rows), 2) - self.assertTrue(all(any(row.values()) for row in rows)) + self.assertEqual(len(result.transactions), 2) + self.assertTrue(all(any(t.to_dict().values()) for t in result.transactions)) def test_classify_row_type(self): """Test row classification for transaction vs non-transaction content""" @@ -405,7 +407,7 @@ def crop_side_effect(*args): test_pdf_path = Path("/tmp/test.pdf") # Test with dynamic boundary enabled - dynamic_rows, _, iban = extract_tables_from_pdf( + dynamic_result = extract_tables_from_pdf( test_pdf_path, enable_dynamic_boundary=True, enable_page_validation=False, @@ -413,8 +415,8 @@ def crop_side_effect(*args): ) # Should have fewer rows (administrative content filtered) - self.assertEqual(len(dynamic_rows), 1) - self.assertEqual(dynamic_rows[0]["Details"].strip(), "VDC-STORE") + self.assertEqual(len(dynamic_result.transactions), 1) + self.assertEqual(dynamic_result.transactions[0].details.strip(), "VDC-STORE") run_test() @@ -636,26 +638,26 @@ def test_extract_tables_with_page_validation_enabled(self, mock_pdfplumber): test_pdf_path = Path("/tmp/test.pdf") # Test with page validation enabled - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_page_validation=True ) # Should have processed only the valid page - self.assertEqual(page_count, 2) # Total pages processed - self.assertEqual(len(rows), 3) # Only rows from valid page + self.assertEqual(result.page_count, 2) # Total pages processed + self.assertEqual(len(result.transactions), 3) # Only rows from valid page # Verify all returned rows are from valid transactions - for row in rows: - self.assertTrue(any(row.values())) # Non-empty rows - self.assertEqual(row["Filename"], "test.pdf") + for tx in 
result.transactions: + self.assertTrue(any(tx.to_dict().values())) # Non-empty rows + self.assertEqual(tx.filename, "test.pdf") # Test with page validation disabled - rows_no_validation, _, iban = extract_tables_from_pdf( + result_no_val = extract_tables_from_pdf( test_pdf_path, enable_page_validation=False ) # Should process both pages (more rows due to invalid page content) - self.assertGreaterEqual(len(rows_no_validation), len(rows)) + self.assertGreaterEqual(len(result_no_val.transactions), len(result.transactions)) def test_analyze_content_density_empty_word_groups(self): """Test analyze_content_density with empty word groups.""" @@ -930,13 +932,13 @@ def test_extract_tables_with_header_check_enabled_no_headers(self, mock_pdfplumb test_pdf_path = Path("/tmp/test.pdf") # Should skip the page due to no headers - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_dynamic_boundary=True ) # Should return empty results since page was skipped - self.assertEqual(len(rows), 0) - self.assertEqual(page_count, 1) # Still counts the page + self.assertEqual(len(result.transactions), 0) + self.assertEqual(result.page_count, 1) # Still counts the page finally: # Restore environment @@ -968,12 +970,12 @@ def test_extract_tables_static_mode_header_check(self, mock_pdfplumber): test_pdf_path = Path("/tmp/test.pdf") # Should skip page in static mode too - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_dynamic_boundary=False # Static mode ) # Should return empty results - self.assertEqual(len(rows), 0) + self.assertEqual(len(result.transactions), 0) finally: if old_header_check: diff --git a/packages/parser-core/tests/test_processor.py b/packages/parser-core/tests/test_processor.py index 4d193e8..6c22ec2 100644 --- a/packages/parser-core/tests/test_processor.py +++ b/packages/parser-core/tests/test_processor.py @@ -16,6 +16,8 @@ ProcessingConfig, ProcessorConfig, ) 
+from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.processor import ( BankStatementProcessor, calculate_column_totals, @@ -274,9 +276,15 @@ def test_processor_continues_after_pdf_error(self, mock_extract): # Mock: first PDF succeeds, second fails, third succeeds mock_extract.side_effect = [ - ([{"Date": "01 Jan 2024", "Details": "Transaction 1"}], 1, None), + ExtractionResult( + transactions=dicts_to_transactions([{"Date": "01 Jan 2024", "Details": "Transaction 1"}]), + page_count=1, iban=None, source_file=Path("good.pdf"), + ), OSError("Failed to open PDF"), - ([{"Date": "02 Jan 2024", "Details": "Transaction 2"}], 1, None), + ExtractionResult( + transactions=dicts_to_transactions([{"Date": "02 Jan 2024", "Details": "Transaction 2"}]), + page_count=1, iban=None, source_file=Path("good2.pdf"), + ), ] processor = create_test_processor(self.input_dir, self.output_dir) @@ -298,8 +306,14 @@ def test_processor_handles_empty_pdf_gracefully(self, mock_extract): # Mock: first PDF returns no rows, second has data mock_extract.side_effect = [ - ([], 1, None), # Empty PDF with 1 page - ([{"Date": "01 Jan 2024", "Details": "Transaction 1"}], 1, None), + ExtractionResult( + transactions=[], page_count=1, iban=None, source_file=Path("empty.pdf"), + warnings=["credit card statement detected, skipped"], + ), + ExtractionResult( + transactions=dicts_to_transactions([{"Date": "01 Jan 2024", "Details": "Transaction 1"}]), + page_count=1, iban=None, source_file=Path("with_data.pdf"), + ), ] processor = create_test_processor(self.input_dir, self.output_dir) diff --git a/packages/parser-core/tests/test_processor_refactored_methods.py b/packages/parser-core/tests/test_processor_refactored_methods.py index 2c108aa..8e89eca 100644 --- a/packages/parser-core/tests/test_processor_refactored_methods.py +++ b/packages/parser-core/tests/test_processor_refactored_methods.py @@ -16,6 +16,8 @@ 
ExtractionConfig, ProcessorConfig, ) +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.processor import BankStatementProcessor # Module-level defaults to avoid B008 (function call in defaults) @@ -75,8 +77,14 @@ def test_process_all_pdfs_multiple_files(self, mock_extract): # Mock extract_tables_from_pdf to return different data mock_extract.side_effect = [ - ([{"Date": "01/01/23", "Details": "Test1"}], 5, None), - ([{"Date": "02/01/23", "Details": "Test2"}], 3, None), + ExtractionResult( + transactions=dicts_to_transactions([{"Date": "01/01/23", "Details": "Test1"}]), + page_count=5, iban=None, source_file=Path("file1.pdf"), + ), + ExtractionResult( + transactions=dicts_to_transactions([{"Date": "02/01/23", "Details": "Test2"}]), + page_count=3, iban=None, source_file=Path("file2.pdf"), + ), ] all_rows, pages_read, pdf_ibans = processor._process_all_pdfs() @@ -323,13 +331,14 @@ def test_run_uses_refactored_methods(self, mock_extract): ) # Mock extract_tables_from_pdf - mock_extract.return_value = ( - [ + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([ {"Date": "15/06/23", "Details": "Second", "Debit €": "200"}, {"Date": "01/01/23", "Details": "First", "Debit €": "100"}, - ], - 2, - None, + ]), + page_count=2, + iban=None, + source_file=Path("test.pdf"), ) result = processor.run() diff --git a/packages/parser-core/tests/test_transaction_data_no_blocking.py b/packages/parser-core/tests/test_transaction_data_no_blocking.py index a53e10f..ae2596f 100644 --- a/packages/parser-core/tests/test_transaction_data_no_blocking.py +++ b/packages/parser-core/tests/test_transaction_data_no_blocking.py @@ -9,6 +9,8 @@ import pytest from bankstatements_core.config.processor_config import ProcessorConfig +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions def 
create_test_processor(input_dir, output_dir): @@ -77,11 +79,11 @@ def crop_side_effect(bbox): } ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Should NOT be detected as credit card (VISA only in transaction) - assert iban == "IE48AIBK93408921459015" - assert page_count == 1 + assert result.iban == "IE48AIBK93408921459015" + assert result.page_count == 1 # PDF should be processed, not blocked @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") @@ -127,11 +129,11 @@ def crop_side_effect(bbox): } ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Should NOT be detected as credit card - assert iban == "IE29AIBK93115212345678" - assert page_count == 1 + assert result.iban == "IE29AIBK93115212345678" + assert result.page_count == 1 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_iban_in_transaction_does_not_override_header_iban(self, mock_pdfplumber): @@ -176,11 +178,11 @@ def crop_side_effect(bbox): } ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Should return header IBAN, not transaction IBAN - assert iban == "IE48AIBK93408921459015" # Normalized without spaces - assert page_count == 1 + assert result.iban == "IE48AIBK93408921459015" # Normalized without spaces + assert result.page_count == 1 def test_processor_does_not_use_transaction_data_for_exclusion(self): """Test that processor only uses header data for exclusion decisions.""" @@ -197,17 +199,18 @@ def test_processor_does_not_use_transaction_data_for_exclusion(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - # Returns: rows with VISA in them, page count, IBAN from header - mock_extract.return_value = ( - [ + # Returns: ExtractionResult with VISA in transactions, 
IBAN from header + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([ { "Date": "11 Aug", "Details": "MOBI CLICK VISA", "Debit €": "100.00", } - ], - 1, - "IE48AIBK93408921459015", # IBAN from header + ]), + page_count=1, + iban="IE48AIBK93408921459015", + source_file=Path("test.pdf"), ) processor = create_test_processor(input_dir, output_dir) @@ -242,13 +245,14 @@ def test_all_card_brands_in_transactions_allowed(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ( - [ + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([ {"Date": "11 Aug", "Details": detail} for detail in transaction_details - ], - 1, - "IE48AIBK93408921459015", # Has IBAN + ]), + page_count=1, + iban="IE48AIBK93408921459015", + source_file=Path("test.pdf"), ) processor = create_test_processor(input_dir, output_dir) From d925f029d3d270b5337b826958bd96ee1b448a57 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 07:58:09 +0000 Subject: [PATCH 03/14] test(23-02): add failing tests for ExtractionResult return from extract_from_pdf() - Update 4 tuple-unpack assertion sites to use ExtractionResult field access - Add assertIsInstance(result, ExtractionResult) to key assertions - Tests now expect extract_from_pdf() to return ExtractionResult (not tuple) --- .../services/test_extraction_orchestrator.py | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/packages/parser-core/tests/services/test_extraction_orchestrator.py b/packages/parser-core/tests/services/test_extraction_orchestrator.py index 24aaf9e..49d4c84 100644 --- a/packages/parser-core/tests/services/test_extraction_orchestrator.py +++ b/packages/parser-core/tests/services/test_extraction_orchestrator.py @@ -43,11 +43,12 @@ def test_extract_from_pdf_success(self, mock_extract): source_file=pdf_path, ) - rows, pages, iban = 
self.orchestrator.extract_from_pdf(pdf_path) + result = self.orchestrator.extract_from_pdf(pdf_path) - self.assertEqual(len(rows), 1) - self.assertEqual(pages, 5) - self.assertEqual(iban, "IE12BOFI90000112345") + self.assertIsInstance(result, ExtractionResult) + self.assertEqual(len(result.transactions), 1) + self.assertEqual(result.page_count, 5) + self.assertEqual(result.iban, "IE12BOFI90000112345") mock_extract.assert_called_once() @patch( @@ -69,12 +70,13 @@ def test_extract_from_pdf_with_forced_template(self, mock_extract): source_file=pdf_path, ) - rows, pages, iban = self.orchestrator.extract_from_pdf( + result = self.orchestrator.extract_from_pdf( pdf_path, forced_template=mock_template ) - self.assertEqual(len(rows), 1) - self.assertEqual(pages, 3) + self.assertIsInstance(result, ExtractionResult) + self.assertEqual(len(result.transactions), 1) + self.assertEqual(result.page_count, 3) # Verify extraction was called with the forced template call_args = mock_extract.call_args self.assertEqual(call_args[0][0], pdf_path) # First positional arg @@ -91,11 +93,12 @@ def test_extract_from_pdf_no_data(self, mock_extract): transactions=[], page_count=2, iban=None, source_file=pdf_path ) - rows, pages, iban = self.orchestrator.extract_from_pdf(pdf_path) + result = self.orchestrator.extract_from_pdf(pdf_path) - self.assertEqual(len(rows), 0) - self.assertEqual(pages, 2) - self.assertIsNone(iban) + self.assertIsInstance(result, ExtractionResult) + self.assertEqual(len(result.transactions), 0) + self.assertEqual(result.page_count, 2) + self.assertIsNone(result.iban) @patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" @@ -115,12 +118,13 @@ def test_extract_from_pdf_returns_rows_as_is(self, mock_extract): source_file=pdf_path, ) - rows, _, _ = self.orchestrator.extract_from_pdf(pdf_path) + result = self.orchestrator.extract_from_pdf(pdf_path) - # Rows should be returned as-is (Filename is added by caller, not orchestrator) - 
self.assertEqual(len(rows), 2) - self.assertEqual(rows[0]["Details"], "Test1") - self.assertEqual(rows[1]["Details"], "Test2") + self.assertIsInstance(result, ExtractionResult) + # transactions are Transaction objects; check via to_dict() for field values + self.assertEqual(len(result.transactions), 2) + self.assertEqual(result.transactions[0].to_dict()["Details"], "Test1") + self.assertEqual(result.transactions[1].to_dict()["Details"], "Test2") def test_initialization_without_errors(self): """Test orchestrator initializes without errors.""" From c9a7e92c608a878c8d0d4f6c81c0dcb8928ebd46 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 07:58:36 +0000 Subject: [PATCH 04/14] feat(23-02): wire extract_from_pdf() to return ExtractionResult directly - Add ExtractionResult import from bankstatements_core.domain - Change return annotation from tuple[list, int, str | None] to ExtractionResult - Remove tuple unpacking; pass ExtractionResult through unchanged - Use result.iban for IBAN logging - Update docstring Returns section --- .../services/extraction_orchestrator.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py index 4cd9b14..ba11d09 100644 --- a/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py @@ -12,6 +12,7 @@ from bankstatements_core.domain.protocols.services import ITemplateDetector from bankstatements_core.config.processor_config import ExtractionConfig +from bankstatements_core.domain import ExtractionResult from bankstatements_core.entitlements import Entitlements from bankstatements_core.extraction.extraction_facade import extract_tables_from_pdf from bankstatements_core.templates import TemplateDetector, TemplateRegistry @@ 
-132,7 +133,7 @@ def extract_from_pdf( self, pdf_path: Path, forced_template: BankTemplate | None = None, - ) -> tuple[list, int, str | None]: + ) -> ExtractionResult: """Extract transactions from a single PDF file. Args: @@ -141,10 +142,8 @@ def extract_from_pdf( (overrides instance forced_template) Returns: - Tuple of (rows, page_count, iban): - - rows: List of transaction dictionaries - - page_count: Number of pages processed - - iban: Extracted IBAN or None + ExtractionResult containing extracted transactions, page count, IBAN, + and document-level warnings """ # Determine which template to use template = forced_template or self._forced_template @@ -158,7 +157,7 @@ def extract_from_pdf( logger.info(f"Using template: {template.name} for {pdf_path.name}") # Extract transactions using template - extraction_result = extract_tables_from_pdf( + result = extract_tables_from_pdf( pdf_path, self._config.table_top_y, self._config.table_bottom_y, @@ -166,17 +165,14 @@ def extract_from_pdf( self._config.enable_dynamic_boundary, template=template, ) - rows = [tx.to_dict() for tx in extraction_result.transactions] - page_count = extraction_result.page_count - iban = extraction_result.iban # Log IBAN if found - if iban: + if result.iban: logger.info( - f"IBAN extracted from {pdf_path.name}: {iban[:4]}****{iban[-4:]}" + f"IBAN extracted from {pdf_path.name}: {result.iban[:4]}****{result.iban[-4:]}" ) - return rows, page_count, iban + return result def _detect_template(self, pdf_path: Path) -> BankTemplate | None: """Detect template for a PDF file. 
From 3bfa787d20911bc347ba67694c81f1879e60f851 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 08:01:59 +0000 Subject: [PATCH 05/14] feat(23-02): wire process_all_pdfs() to return list[ExtractionResult] - Add ExtractionResult and transactions_to_dicts imports - Change return annotation from tuple[list[dict], int, dict[str,str]] to list[ExtractionResult] - Replace tuple unpacking with result.page_count/iban/transactions field access - Exclusion logic preserved using ExtractionResult fields - filter_service.apply_all_filters() receives transactions_to_dicts(result.transactions) - Append excluded results to list (page_count preserved for callers) - Update processor._process_all_pdfs() to adapt list[ExtractionResult] -> 3-tuple - Update test_processor test mock to use list[ExtractionResult] return value --- .../src/bankstatements_core/processor.py | 19 +++++++++- .../services/pdf_processing_orchestrator.py | 35 ++++++++++--------- packages/parser-core/tests/test_processor.py | 18 ++++++++-- 3 files changed, 51 insertions(+), 21 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/processor.py b/packages/parser-core/src/bankstatements_core/processor.py index 2480ebf..381d8c6 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -282,11 +282,28 @@ def _process_all_pdfs(self) -> tuple[list[dict], int, dict[str, str]]: Returns: Tuple of (all_rows, total_pages_read, pdf_ibans) """ + from bankstatements_core.domain.converters import transactions_to_dicts + # Delegate to PDF processing orchestrator with recursive_scan setting - return self._pdf_orchestrator.process_all_pdfs( + results = self._pdf_orchestrator.process_all_pdfs( self.input_dir, recursive=self.recursive_scan ) + # Adapt list[ExtractionResult] → (all_rows, pages_read, pdf_ibans) for caller + all_rows: list[dict] = [] + pages_read = 0 + pdf_ibans: dict[str, str] = {} + for result in results: + 
pages_read += result.page_count + # Skip excluded results (no IBAN, no transactions, but had pages) + if result.iban is None and len(result.transactions) == 0 and result.page_count > 0: + continue + if result.iban: + pdf_ibans[result.source_file.name] = result.iban + all_rows.extend(transactions_to_dicts(result.transactions)) + + return all_rows, pages_read, pdf_ibans + def _sort_transactions_by_date(self, rows: list[dict]) -> list[dict]: """ Sort transactions using the configured sorting strategy. diff --git a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py index e15f700..79cc0a0 100644 --- a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py @@ -16,6 +16,8 @@ from typing import TYPE_CHECKING, Any from bankstatements_core.config.processor_config import ExtractionConfig +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import transactions_to_dicts from bankstatements_core.entitlements import Entitlements if TYPE_CHECKING: @@ -103,7 +105,7 @@ def __init__( def process_all_pdfs( self, input_dir: Path, recursive: bool = False - ) -> tuple[list[dict], int, dict[str, str]]: + ) -> list[ExtractionResult]: """Process all PDF files in the input directory. 
Args: @@ -111,18 +113,16 @@ def process_all_pdfs( recursive: Whether to search subdirectories recursively Returns: - Tuple of (all_transactions, total_pages_read, pdf_ibans) + list[ExtractionResult] — one ExtractionResult per successfully processed PDF Examples: # orchestrator = PDFProcessingOrchestrator(config, columns, output_dir) - # result = orchestrator.process_all_pdfs(Path("input")) - # transactions, pages, ibans = result - # Processed {len(transactions)} transactions + # results = orchestrator.process_all_pdfs(Path("input")) """ # Discover PDF files pdf_files = self.pdf_discovery.discover_pdfs(input_dir, recursive=recursive) - all_rows: list[dict] = [] + results: list[ExtractionResult] = [] pages_read = 0 pdf_ibans: dict[str, str] = {} excluded_files: list[dict[str, Any]] = [] @@ -132,13 +132,11 @@ def process_all_pdfs( logger.info("Processing PDF %d of %d", idx, len(pdf_files)) try: - rows, page_count, iban = self.extraction_orchestrator.extract_from_pdf( - pdf - ) - pages_read += page_count + result = self.extraction_orchestrator.extract_from_pdf(pdf) + pages_read += result.page_count # Check if should be excluded (no IBAN and no data) - if iban is None and len(rows) == 0 and page_count > 0: + if result.iban is None and len(result.transactions) == 0 and result.page_count > 0: excluded_files.append( { "filename": pdf.name, @@ -148,7 +146,7 @@ def process_all_pdfs( "(likely credit card statement)" ), "timestamp": datetime.now().isoformat(), - "pages": page_count, + "pages": result.page_count, } ) logger.warning( @@ -156,21 +154,24 @@ def process_all_pdfs( idx, pdf.name, ) + results.append(result) continue # Store IBAN if found - if iban: - pdf_ibans[pdf.name] = iban + if result.iban: + pdf_ibans[pdf.name] = result.iban # Apply filters to extracted rows - filtered_rows = self.filter_service.apply_all_filters(rows) + filtered_rows = self.filter_service.apply_all_filters( + transactions_to_dicts(result.transactions) + ) - all_rows.extend(filtered_rows) 
logger.info( "Successfully processed PDF %d: %d transactions extracted", idx, len(filtered_rows), ) + results.append(result) except (OSError, ValueError, AttributeError, KeyError) as e: # Expected errors: file I/O, invalid PDF structure, missing attributes, missing keys @@ -190,7 +191,7 @@ def process_all_pdfs( if excluded_files: self._save_excluded_files(excluded_files) - return all_rows, pages_read, pdf_ibans + return results def _save_ibans(self, pdf_ibans: dict[str, str]) -> None: """Save extracted IBANs to JSON file. diff --git a/packages/parser-core/tests/test_processor.py b/packages/parser-core/tests/test_processor.py index 6c22ec2..eb8d05e 100644 --- a/packages/parser-core/tests/test_processor.py +++ b/packages/parser-core/tests/test_processor.py @@ -222,9 +222,21 @@ def test_run_with_mock_data(self): with patch( "bankstatements_core.services.pdf_processing_orchestrator.PDFProcessingOrchestrator.process_all_pdfs" ) as mock_process: - # Configure mock to return tuple of (transactions, pages_read, ibans) - all_transactions = mock_data_pdf1 + mock_data_pdf2 - mock_process.return_value = (all_transactions, 2, {}) + # Configure mock to return list[ExtractionResult] + mock_process.return_value = [ + ExtractionResult( + transactions=dicts_to_transactions(mock_data_pdf1), + page_count=1, + iban=None, + source_file=pdf1, + ), + ExtractionResult( + transactions=dicts_to_transactions(mock_data_pdf2), + page_count=1, + iban=None, + source_file=pdf2, + ), + ] processor = create_test_processor(self.input_dir, self.output_dir) result = processor.run() From a089d0263c3ff28734e213a2c18353c03440a0a1 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 08:05:36 +0000 Subject: [PATCH 06/14] feat(23-03): wire processor.py to return list[ExtractionResult] - _process_all_pdfs() now returns list[ExtractionResult] directly (no adapter) - run() loops over results, aggregates all_rows/pages_read/pdf_ibans via transactions_to_dicts() - Added ExtractionResult and 
transactions_to_dicts imports at module level - test_processor_refactored_methods.py updated: tuple-unpack assertions replaced with indexed list access --- .../src/bankstatements_core/processor.py | 35 ++++++++----------- .../test_processor_refactored_methods.py | 20 +++++------ 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/processor.py b/packages/parser-core/src/bankstatements_core/processor.py index 381d8c6..5da8dca 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -30,6 +30,8 @@ TransactionProcessingOrchestrator, ) from bankstatements_core.utils import is_date_column, to_float # noqa: F401 +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import transactions_to_dicts logger = logging.getLogger(__name__) @@ -276,34 +278,17 @@ def _detect_duplicates(self, all_rows: list[dict]) -> tuple[list[dict], list[dic """ return self._duplicate_service.detect_and_separate(all_rows) - def _process_all_pdfs(self) -> tuple[list[dict], int, dict[str, str]]: + def _process_all_pdfs(self) -> list[ExtractionResult]: """Process all PDF files in the input directory and extract transaction data. 
Returns: - Tuple of (all_rows, total_pages_read, pdf_ibans) + list[ExtractionResult] — one per processed PDF """ - from bankstatements_core.domain.converters import transactions_to_dicts - # Delegate to PDF processing orchestrator with recursive_scan setting - results = self._pdf_orchestrator.process_all_pdfs( + return self._pdf_orchestrator.process_all_pdfs( self.input_dir, recursive=self.recursive_scan ) - # Adapt list[ExtractionResult] → (all_rows, pages_read, pdf_ibans) for caller - all_rows: list[dict] = [] - pages_read = 0 - pdf_ibans: dict[str, str] = {} - for result in results: - pages_read += result.page_count - # Skip excluded results (no IBAN, no transactions, but had pages) - if result.iban is None and len(result.transactions) == 0 and result.page_count > 0: - continue - if result.iban: - pdf_ibans[result.source_file.name] = result.iban - all_rows.extend(transactions_to_dicts(result.transactions)) - - return all_rows, pages_read, pdf_ibans - def _sort_transactions_by_date(self, rows: list[dict]) -> list[dict]: """ Sort transactions using the configured sorting strategy. 
@@ -325,7 +310,15 @@ def run(self) -> dict: start_time = datetime.now() # Step 1: Extract transactions from all PDFs (delegated to orchestrator) - all_rows, pages_read, pdf_ibans = self._process_all_pdfs() + extraction_results = self._process_all_pdfs() # list[ExtractionResult] + all_rows: list[dict] = [] + pages_read = 0 + pdf_ibans: dict[str, str] = {} + for result in extraction_results: + pages_read += result.page_count + if result.iban: + pdf_ibans[result.source_file.name] = result.iban + all_rows.extend(transactions_to_dicts(result.transactions)) pdf_count = len(sorted(self.input_dir.glob("*.pdf"))) # Step 2: Group transactions by IBAN (delegated to orchestrator) diff --git a/packages/parser-core/tests/test_processor_refactored_methods.py b/packages/parser-core/tests/test_processor_refactored_methods.py index 8e89eca..92ff97d 100644 --- a/packages/parser-core/tests/test_processor_refactored_methods.py +++ b/packages/parser-core/tests/test_processor_refactored_methods.py @@ -87,14 +87,16 @@ def test_process_all_pdfs_multiple_files(self, mock_extract): ), ] - all_rows, pages_read, pdf_ibans = processor._process_all_pdfs() + results = processor._process_all_pdfs() # Verify results - self.assertEqual(len(all_rows), 2) - self.assertEqual(pages_read, 8) # 5 + 3 - self.assertEqual(all_rows[0]["Details"], "Test1") - self.assertEqual(all_rows[1]["Details"], "Test2") - self.assertIsInstance(pdf_ibans, dict) + self.assertEqual(len(results), 2) + self.assertEqual(results[0].page_count, 5) + self.assertEqual(results[1].page_count, 3) + self.assertEqual(results[0].transactions[0].to_dict()["Details"], "Test1") + self.assertEqual(results[1].transactions[0].to_dict()["Details"], "Test2") + self.assertIsNone(results[0].iban) + self.assertIsNone(results[1].iban) @patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" @@ -114,12 +116,10 @@ def test_process_all_pdfs_no_files(self, mock_extract): with patch.object(Path, "glob") as mock_glob: 
mock_glob.return_value = [] - all_rows, pages_read, pdf_ibans = processor._process_all_pdfs() + results = processor._process_all_pdfs() # Should return empty results - self.assertEqual(len(all_rows), 0) - self.assertEqual(pages_read, 0) - self.assertEqual(len(pdf_ibans), 0) + self.assertEqual(len(results), 0) mock_extract.assert_not_called() def test_sort_transactions_by_date_mixed_dates(self): From 0334de62fce11430042fa36efc96e7f051ddcea0 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 09:59:31 +0000 Subject: [PATCH 07/14] test(24-01): add failing tests for word_utils pure functions - group_words_by_y: 8 tests for Y-grouping with tolerance param - assign_words_to_columns: 11 tests for strict/relaxed x-position assignment - calculate_column_coverage: 7 tests for coverage fraction calculation --- .../tests/extraction/test_word_utils.py | 274 ++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 packages/parser-core/tests/extraction/test_word_utils.py diff --git a/packages/parser-core/tests/extraction/test_word_utils.py b/packages/parser-core/tests/extraction/test_word_utils.py new file mode 100644 index 0000000..b92efe8 --- /dev/null +++ b/packages/parser-core/tests/extraction/test_word_utils.py @@ -0,0 +1,274 @@ +"""TDD tests for extraction/word_utils.py. + +Written BEFORE implementation (RED phase). All tests must fail with ImportError +until word_utils.py is created. 
+""" + +from __future__ import annotations + +import pytest + +from bankstatements_core.extraction.word_utils import ( + assign_words_to_columns, + calculate_column_coverage, + group_words_by_y, +) + +# --------------------------------------------------------------------------- +# Shared fixtures / helpers +# --------------------------------------------------------------------------- + +COLUMNS = { + "Date": (0, 50), + "Details": (50, 200), + "Debit": (200, 250), + "Credit": (250, 300), + "Balance": (300, 350), +} + + +def _word(text: str, x0: float, top: float, x1: float | None = None) -> dict: + w: dict = {"text": text, "x0": x0, "top": top} + if x1 is not None: + w["x1"] = x1 + return w + + +# --------------------------------------------------------------------------- +# TestGroupWordsByY +# --------------------------------------------------------------------------- + + +class TestGroupWordsByY: + def test_basic_grouping_two_distinct_y_values(self): + """Words with different top values produce separate groups.""" + words = [ + _word("A", 10, 350.0), + _word("B", 60, 370.0), + ] + result = group_words_by_y(words) + assert 350.0 in result + assert 370.0 in result + assert len(result[350.0]) == 1 + assert result[350.0][0]["text"] == "A" + assert len(result[370.0]) == 1 + assert result[370.0][0]["text"] == "B" + + def test_fractional_tops_round_to_integer_key(self): + """Words with top=100.3 and top=100.7 both round to 100.0 (one group).""" + words = [ + _word("A", 10, 100.3), + _word("B", 60, 100.7), + ] + result = group_words_by_y(words) + assert len(result) == 1 + assert 100.0 in result + assert len(result[100.0]) == 2 + + def test_tolerance_parameter_accepted(self): + """tolerance parameter is accepted without raising TypeError.""" + words = [_word("A", 10, 200.0)] + # Should not raise even though tolerance does not change behaviour + result = group_words_by_y(words, tolerance=5.0) + assert 200.0 in result + + def test_tolerance_does_not_change_grouping(self): + 
"""Grouping is by round(top, 0) regardless of tolerance value.""" + words = [ + _word("A", 10, 100.3), + _word("B", 60, 100.7), + ] + result_default = group_words_by_y(words) + result_large_tolerance = group_words_by_y(words, tolerance=50.0) + assert set(result_default.keys()) == set(result_large_tolerance.keys()) + + def test_no_table_top_y_filtering(self): + """Words with any top value are included -- no table_top_y minimum.""" + words = [ + _word("A", 10, 0.0), + _word("B", 60, 5.0), + _word("C", 60, 999.0), + ] + result = group_words_by_y(words) + assert len(result) == 3 + + def test_empty_list_returns_empty_dict(self): + """Empty word list produces an empty dict.""" + result = group_words_by_y([]) + assert result == {} + + def test_multiple_words_same_y_grouped_together(self): + """Multiple words at the same rounded Y appear in one group.""" + words = [ + _word("01/01", 5, 320.0), + _word("Tesco", 60, 320.0), + _word("12.50", 210, 320.0), + ] + result = group_words_by_y(words) + assert len(result) == 1 + assert len(result[320.0]) == 3 + + def test_returned_dict_keys_are_float(self): + """All keys in the returned dict are floats (not ints).""" + words = [_word("A", 10, 200.0)] + result = group_words_by_y(words) + for key in result: + assert isinstance(key, float), f"Expected float key, got {type(key)}" + + +# --------------------------------------------------------------------------- +# TestAssignWordsToColumns +# --------------------------------------------------------------------------- + + +class TestAssignWordsToColumns: + def test_relaxed_check_by_default(self): + """With strict_rightmost=False (default), all columns use xmin <= x0 < xmax.""" + words = [_word("Hello", 60, 320.0, x1=90)] + result = assign_words_to_columns(words, COLUMNS) + assert result["Details"] == "Hello" + + def test_all_columns_have_empty_string_keys(self): + """Returned dict contains all column names, even those without data.""" + result = assign_words_to_columns([], COLUMNS) + assert 
set(result.keys()) == set(COLUMNS.keys()) + for val in result.values(): + assert val == "" + + def test_empty_words_returns_empty_row(self): + """Empty words list returns dict with all column keys mapped to empty string.""" + result = assign_words_to_columns([], COLUMNS) + assert result == {col: "" for col in COLUMNS} + + def test_strict_rightmost_false_accepts_word_extending_beyond_rightmost(self): + """Without strict_rightmost, a word whose x1 extends past xmax is still placed.""" + # x0=310 is inside Balance (300-350), x1=360 extends beyond xmax=350 + # relaxed check: xmin(300) <= x0(310) < xmax(350) → placed + words = [_word("1234.56", 310, 320.0, x1=360)] + result = assign_words_to_columns(words, COLUMNS) + assert result["Balance"] == "1234.56" + + def test_strict_rightmost_true_rejects_word_extending_beyond_rightmost(self): + """With strict_rightmost=True, a word whose x1 > xmax of rightmost is rejected.""" + # x0=310 inside Balance (300-350), x1=360 extends beyond xmax=350 + # strict check: xmin(300) <= x0(310) AND x1(360) <= xmax(350) → FAILS + words = [_word("1234.56", 310, 320.0, x1=360)] + result = assign_words_to_columns(words, COLUMNS, strict_rightmost=True) + assert result["Balance"] == "" + + def test_strict_rightmost_true_accepts_word_contained_in_rightmost(self): + """With strict_rightmost=True, a word fully within rightmost column is accepted.""" + # x0=310, x1=345 both within Balance (300-350) + words = [_word("100.00", 310, 320.0, x1=345)] + result = assign_words_to_columns(words, COLUMNS, strict_rightmost=True) + assert result["Balance"] == "100.00" + + def test_strict_rightmost_does_not_affect_non_rightmost_columns(self): + """With strict_rightmost=True, non-rightmost columns still use relaxed check.""" + # Details word x0=60 x1=999 (way beyond Details xmax=200) + # relaxed check: xmin(50) <= x0(60) < xmax(200) → placed (x1 ignored) + words = [_word("Tesco", 60, 320.0, x1=999)] + result = assign_words_to_columns(words, COLUMNS, 
strict_rightmost=True) + assert result["Details"] == "Tesco" + + def test_x1_fallback_when_key_missing(self): + """When x1 key is absent, fallback x1 = x0 + max(len(text)*3, 10) is used.""" + # "Hi" has len=2; fallback: x0(60) + max(2*3, 10) = 60 + 10 = 70 + # With strict_rightmost=True, Balance (300-350): x0=310, x1=320 (both inside) → placed + words = [{"text": "Hi", "x0": 310, "top": 320.0}] # no x1 key + result = assign_words_to_columns(words, COLUMNS, strict_rightmost=True) + # fallback x1 = 310 + max(len("Hi")*3, 10) = 310 + 10 = 320 <= 350 → accepted + assert result["Balance"] == "Hi" + + def test_x1_fallback_strict_rightmost_rejects_with_long_text(self): + """Long text without x1 causes large fallback x1, which fails strict check.""" + # "LongTextHere" has len=12; fallback: x0(310) + max(12*3, 10) = 310 + 36 = 346 <= 350 + # Actually 346 <= 350 passes. Use a very long text. + # "AReallylongtextstring" has len=21; fallback: 310 + max(21*3, 10) = 310 + 63 = 373 > 350 → rejected + long_text = "AReallylongtextstring" # len=21 + words = [{"text": long_text, "x0": 310, "top": 320.0}] # no x1 + result = assign_words_to_columns(words, COLUMNS, strict_rightmost=True) + assert result["Balance"] == "" + + def test_multiple_words_concatenated_with_space(self): + """Multiple words in the same column are concatenated with space, then stripped.""" + words = [ + _word("John", 60, 320.0, x1=90), + _word("Smith", 95, 320.0, x1=130), + ] + result = assign_words_to_columns(words, COLUMNS) + assert result["Details"] == "John Smith" + + def test_values_are_stripped(self): + """Final values have leading/trailing whitespace stripped.""" + words = [_word("Hi", 60, 320.0, x1=90)] + result = assign_words_to_columns(words, COLUMNS) + assert result["Details"] == "Hi" + assert not result["Details"].endswith(" ") + + def test_word_outside_all_columns_not_assigned(self): + """A word whose x0 falls outside every column boundary is not placed anywhere.""" + # x0=500 is beyond all column 
boundaries (max xmax=350) + words = [_word("Orphan", 500, 320.0, x1=530)] + result = assign_words_to_columns(words, COLUMNS) + for val in result.values(): + assert val == "" + + +# --------------------------------------------------------------------------- +# TestCalculateColumnCoverage +# --------------------------------------------------------------------------- + + +class TestCalculateColumnCoverage: + def test_all_columns_have_data_returns_one(self): + """When every column has data in at least one row, coverage is 1.0.""" + rows = [ + {"Date": "01/01", "Details": "Tesco", "Debit": "12.50", "Credit": "", "Balance": "100.00"}, + {"Date": "", "Details": "AIB", "Debit": "", "Credit": "50.00", "Balance": ""}, + ] + result = calculate_column_coverage(rows, COLUMNS) + assert result == 1.0 + + def test_partial_coverage(self): + """2 of 4 columns having data returns 0.5.""" + two_col_columns = {"Date": (0, 50), "Details": (50, 200), "Debit": (200, 250), "Credit": (250, 300)} + rows = [ + {"Date": "01/01", "Details": "Tesco", "Debit": "", "Credit": ""}, + ] + result = calculate_column_coverage(rows, two_col_columns) + assert result == 0.5 + + def test_empty_rows_returns_zero(self): + """No rows means no coverage.""" + result = calculate_column_coverage([], COLUMNS) + assert result == 0.0 + + def test_empty_columns_returns_zero(self): + """No columns means no coverage.""" + rows = [{"Date": "01/01"}] + result = calculate_column_coverage(rows, {}) + assert result == 0.0 + + def test_whitespace_only_values_not_counted(self): + """Values that are only whitespace are not counted as data.""" + rows = [ + {"Date": " ", "Details": "\t", "Debit": "", "Credit": "", "Balance": ""}, + ] + result = calculate_column_coverage(rows, COLUMNS) + assert result == 0.0 + + def test_single_row_single_column_has_data(self): + """1 of 5 columns having data in one row returns 0.2.""" + rows = [{"Date": "", "Details": "Tesco", "Debit": "", "Credit": "", "Balance": ""}] + result = 
calculate_column_coverage(rows, COLUMNS) + assert result == pytest.approx(0.2) + + def test_same_column_in_multiple_rows_counts_once(self): + """The same column name is counted once even if it has data in multiple rows.""" + rows = [ + {"Date": "01/01", "Details": "", "Debit": "", "Credit": "", "Balance": ""}, + {"Date": "02/01", "Details": "", "Debit": "", "Credit": "", "Balance": ""}, + ] + result = calculate_column_coverage(rows, COLUMNS) + assert result == pytest.approx(0.2) From aecb9d34de49ef37074ce204eac2595235e1ba71 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:03:30 +0000 Subject: [PATCH 08/14] feat(24-01): implement word_utils.py with three pure standalone functions - group_words_by_y: groups words by round(top, 0); tolerance param accepted for API compat - assign_words_to_columns: relaxed/strict rightmost column boundary check - calculate_column_coverage: returns fraction of columns with non-empty data across rows - Fix test rounding: round(100.7, 0) is 101.0 (ordinary rounding; banker's round-half-to-even only affects exact .5 ties) --- .../extraction/word_utils.py | 128 ++++++++++++++++++ .../tests/extraction/test_word_utils.py | 11 +- 2 files changed, 136 insertions(+), 3 deletions(-) create mode 100644 packages/parser-core/src/bankstatements_core/extraction/word_utils.py diff --git a/packages/parser-core/src/bankstatements_core/extraction/word_utils.py b/packages/parser-core/src/bankstatements_core/extraction/word_utils.py new file mode 100644 index 0000000..95cbff0 --- /dev/null +++ b/packages/parser-core/src/bankstatements_core/extraction/word_utils.py @@ -0,0 +1,128 @@ +"""Pure standalone word-processing utility functions for PDF table extraction. 
+ +These functions consolidate duplicated private-method logic from: +- extraction/boundary_detector.py (_group_words_by_y, _build_row_from_words, + _calculate_column_coverage) +- extraction/row_builder.py (inline word-grouping and column-assignment loop) +- services/header_detection.py (_group_words_by_y_coordinate) +- services/page_validation.py (calculate_column_coverage -- canonical version) + +This module is intentionally NOT re-exported from extraction/__init__.py. +Callers import directly: + from bankstatements_core.extraction.word_utils import group_words_by_y +""" + +from __future__ import annotations + + +def group_words_by_y( + words: list[dict], + tolerance: float = 1.0, +) -> dict[float, list[dict]]: + """Group words by Y-coordinate (rounded to the nearest integer). + + Args: + words: List of word dicts, each containing at least ``"top"`` and + ``"text"`` keys. + tolerance: Accepted for API compatibility but the current implementation + always rounds to 0 decimal places (``round(w["top"], 0)``), + which is the behaviour used by all original callers. Changing + tolerance to a non-zero value does **not** affect the grouping. + + Returns: + A dict mapping ``float`` Y-coordinate keys to the list of word dicts + at that rounded Y position. Keys are the result of + ``round(w["top"], 0)``, which always produces a ``float`` in Python 3. + + Note: + This function applies **no** ``table_top_y`` filtering. Callers that + need to exclude words above the table boundary should pre-filter the + word list before calling this function. + """ + lines: dict[float, list[dict]] = {} + for w in words: + y_key = round(w["top"], 0) + lines.setdefault(y_key, []).append(w) + return lines + + +def assign_words_to_columns( + words: list[dict], + columns: dict[str, tuple[int | float, int | float]], + strict_rightmost: bool = False, +) -> dict[str, str]: + """Assign words to named columns based on their x-position. + + Args: + words: List of word dicts at a single Y-coordinate. 
Each dict must + contain ``"text"`` and ``"x0"`` keys. ``"x1"`` is optional; if + absent the right edge is estimated as + ``x0 + max(len(text) * 3, 10)``. + columns: Ordered mapping of column name to ``(xmin, xmax)`` boundary + tuple. The last entry is treated as the *rightmost* column when + ``strict_rightmost=True``. + strict_rightmost: When ``True``, the rightmost column uses a strict + containment check (``xmin <= x0 and x1 <= xmax``) to prevent + footer or overflow text from bleeding into balance/amount columns. + When ``False`` (default), all columns use a relaxed left-edge + check (``xmin <= x0 < xmax``). + + Returns: + A dict mapping every column name to its accumulated text value, + stripped of leading/trailing whitespace. Columns with no matching + words map to an empty string. + """ + column_names = list(columns.keys()) + rightmost = column_names[-1] if column_names else None + row: dict[str, str] = dict.fromkeys(columns, "") + + for w in words: + x0: float = w["x0"] + text: str = w["text"] + x1: float = w.get("x1", x0 + max(len(text) * 3, 10)) + + for col, (xmin, xmax) in columns.items(): + if strict_rightmost and col == rightmost: + if xmin <= x0 and x1 <= xmax: + row[col] += text + " " + break + else: + if xmin <= x0 < xmax: + row[col] += text + " " + break + + return {k: v.strip() for k, v in row.items()} + + +def calculate_column_coverage( + rows: list[dict], + columns: dict[str, tuple[int | float, int | float]], +) -> float: + """Return the fraction of columns that have non-empty data across all rows. + + A column is counted as having data if at least one row contains a + non-empty, non-whitespace string for that column name. + + Args: + rows: List of row dicts, each mapping column names to string values. + columns: Column definitions mapping name to ``(xmin, xmax)``. Used + to determine the full set of column names and the denominator for + the coverage fraction. + + Returns: + A float in ``[0.0, 1.0]``. 
Returns ``0.0`` when ``rows`` is empty + or ``columns`` is empty. + + Note: + Canonical source: ``PageValidationService.calculate_column_coverage`` + in ``services/page_validation.py`` (L116–L133). + """ + if not rows or not columns: + return 0.0 + column_names = list(columns.keys()) + columns_with_data: set[str] = set() + for row in rows: + for col_name in column_names: + if row.get(col_name, "").strip(): + columns_with_data.add(col_name) + return len(columns_with_data) / len(column_names) diff --git a/packages/parser-core/tests/extraction/test_word_utils.py b/packages/parser-core/tests/extraction/test_word_utils.py index b92efe8..79f14a4 100644 --- a/packages/parser-core/tests/extraction/test_word_utils.py +++ b/packages/parser-core/tests/extraction/test_word_utils.py @@ -55,10 +55,15 @@ def test_basic_grouping_two_distinct_y_values(self): assert result[370.0][0]["text"] == "B" def test_fractional_tops_round_to_integer_key(self): - """Words with top=100.3 and top=100.7 both round to 100.0 (one group).""" + """Words with top values that round to the same integer produce one group. + + round(100.3, 0) == 100.0 and round(100.4, 0) == 100.0 → one group. + Note: round(100.7, 0) == 101.0 (ordinary rounding — banker's round-half-to-even + only affects exact .5 ties, e.g. round(100.5, 0) == 100.0), producing a separate + group. The implementation uses Python's built-in round(top, 0). 
+ """ words = [ _word("A", 10, 100.3), - _word("B", 60, 100.7), + _word("B", 60, 100.4), ] result = group_words_by_y(words) assert len(result) == 1 @@ -76,7 +81,7 @@ def test_tolerance_does_not_change_grouping(self): """Grouping is by round(top, 0) regardless of tolerance value.""" words = [ _word("A", 10, 100.3), - _word("B", 60, 100.7), + _word("B", 60, 100.4), ] result_default = group_words_by_y(words) result_large_tolerance = group_words_by_y(words, tolerance=50.0) From 69046c051f31f6871601ef5da357069a3500d2e2 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:12:53 +0000 Subject: [PATCH 09/14] feat(24-02): migrate all callers to word_utils, delete private originals - boundary_detector.py: import group_words_by_y/assign_words_to_columns/calculate_column_coverage; add pre-filter before group_words_by_y; delete _group_words_by_y, _build_row_from_words, _calculate_column_coverage - row_builder.py: import assign_words_to_columns/group_words_by_y; replace inline loops with word_utils calls; remove _rightmost_column/_column_names attrs - header_detection.py: import group_words_by_y; delete _group_words_by_y_coordinate private method - page_validation.py: import calculate_column_coverage as _calculate_column_coverage_impl; make calculate_column_coverage a thin wrapper - content_density.py: import assign_words_to_columns; replace inline column-assignment loop --- .../extraction/boundary_detector.py | 80 +++---------------- .../extraction/row_builder.py | 31 +------ .../services/content_density.py | 16 +--- .../services/header_detection.py | 28 +------ .../services/page_validation.py | 18 ++--- 5 files changed, 28 insertions(+), 145 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/boundary_detector.py b/packages/parser-core/src/bankstatements_core/extraction/boundary_detector.py index 997cc63..7c4a8ef 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/boundary_detector.py +++ 
b/packages/parser-core/src/bankstatements_core/extraction/boundary_detector.py @@ -13,6 +13,11 @@ RowClassifier, create_row_classifier_chain, ) +from bankstatements_core.extraction.word_utils import ( + assign_words_to_columns, + calculate_column_coverage, + group_words_by_y, +) logger = logging.getLogger(__name__) @@ -97,7 +102,8 @@ def detect_boundary(self, words: list[dict]) -> int: return self.fallback_bottom_y # Phase 0: Group words and find transaction positions - lines = self._group_words_by_y(words) + filtered_words = [w for w in words if w["top"] >= self.table_top_y] + lines = group_words_by_y(filtered_words) if not lines: return self.fallback_bottom_y @@ -156,23 +162,6 @@ def detect_boundary(self, words: list[dict]) -> int: logger.debug("No clear table end detected - using fallback boundary") return self.fallback_bottom_y - def _group_words_by_y(self, words: list[dict]) -> dict[float, list[dict]]: - """ - Group words by Y-coordinate (rounded). - - Args: - words: List of word dictionaries with 'top', 'x0', 'text' keys - - Returns: - Dictionary mapping Y-coordinate to list of words at that Y - """ - lines: dict[float, list[dict]] = {} - for w in words: - if w["top"] >= self.table_top_y: - y_key = round(w["top"], 0) - lines.setdefault(y_key, []).append(w) - return lines - def _find_transaction_positions( self, lines: dict[float, list[dict]], sorted_y_coords: list[float] ) -> tuple[list[float], float | None]: @@ -190,7 +179,7 @@ def _find_transaction_positions( last_transaction_y = None for y_coord in sorted_y_coords: - row = self._build_row_from_words(lines[y_coord]) + row = assign_words_to_columns(lines[y_coord], self.columns) if any(row.values()): row_type = self._row_classifier.classify(row, self.columns) @@ -278,7 +267,7 @@ def _detect_by_spatial_gaps( post_gap_transactions = 0 for y_coord in post_gap_y_coords: - row = self._build_row_from_words(lines[y_coord]) + row = assign_words_to_columns(lines[y_coord], self.columns) if ( any(row.values()) @@ 
-320,11 +309,11 @@ def _detect_by_structure_breakdown( if last_transaction_y is not None and y_coord <= last_transaction_y: continue - row = self._build_row_from_words(lines[y_coord]) + row = assign_words_to_columns(lines[y_coord], self.columns) if any(row.values()): # Check if this row has any structure (data in expected columns) - column_coverage = self._calculate_column_coverage([row]) + column_coverage = calculate_column_coverage([row], self.columns) if column_coverage < 0.3: # Less than 30% of columns have data structure_breakdown_count += 1 else: @@ -369,7 +358,7 @@ def _detect_by_consecutive_non_transactions( if last_transaction_y is not None and y_coord <= last_transaction_y: continue - row = self._build_row_from_words(lines[y_coord]) + row = assign_words_to_columns(lines[y_coord], self.columns) if any(row.values()): row_type = self._row_classifier.classify(row, self.columns) @@ -391,48 +380,3 @@ def _detect_by_consecutive_non_transactions( ) return None - - def _build_row_from_words(self, words: list[dict]) -> dict[str, str]: - """ - Build a row dictionary from words by assigning to columns. - - Args: - words: List of words at the same Y-coordinate - - Returns: - Dictionary mapping column names to concatenated text - """ - row = dict.fromkeys(self.columns, "") - - for w in words: - x0 = w["x0"] - text = w["text"] - for col, (xmin, xmax) in self.columns.items(): - if xmin <= x0 < xmax: - row[col] += text + " " - break - - return {k: v.strip() for k, v in row.items()} - - def _calculate_column_coverage(self, rows: list[dict[str, str]]) -> float: - """ - Calculate what percentage of columns have data in the given rows. 
- - Args: - rows: List of row dictionaries - - Returns: - Float between 0.0 and 1.0 representing column coverage - """ - if not rows: - return 0.0 - - total_columns = len(self.columns) - columns_with_data = set() - - for row in rows: - for col_name, value in row.items(): - if value and value.strip(): - columns_with_data.add(col_name) - - return len(columns_with_data) / total_columns if total_columns > 0 else 0.0 diff --git a/packages/parser-core/src/bankstatements_core/extraction/row_builder.py b/packages/parser-core/src/bankstatements_core/extraction/row_builder.py index 6034a50..47fb246 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/row_builder.py +++ b/packages/parser-core/src/bankstatements_core/extraction/row_builder.py @@ -9,6 +9,8 @@ import logging from typing import TYPE_CHECKING +from bankstatements_core.extraction.word_utils import assign_words_to_columns, group_words_by_y + if TYPE_CHECKING: from bankstatements_core.extraction.row_classifiers import RowClassifier @@ -31,8 +33,6 @@ def __init__( ) -> None: self._columns = columns self._row_classifier = row_classifier - self._column_names = list(columns.keys()) - self._rightmost_column = self._column_names[-1] if self._column_names else None def build_rows(self, words: list[dict]) -> list[dict]: """Group words by Y position, assign to columns, return transaction/continuation rows. 
@@ -43,35 +43,12 @@ def build_rows(self, words: list[dict]) -> list[dict]: Returns: List of row dictionaries classified as 'transaction' or 'continuation' """ - lines: dict[float, list[dict]] = {} - for w in words: - y_key = round(w["top"], 0) - lines.setdefault(y_key, []).append(w) - + lines = group_words_by_y(words) page_rows = [] for _, line_words in sorted(lines.items()): - row = dict.fromkeys(self._columns, "") - - for w in line_words: - x0 = w["x0"] - x1 = w.get("x1", x0 + max(len(w["text"]) * 3, 10)) - text = w["text"] - - for col, (xmin, xmax) in self._columns.items(): - if col == self._rightmost_column: - if xmin <= x0 and x1 <= xmax: - row[col] += text + " " - break - else: - if xmin <= x0 < xmax: - row[col] += text + " " - break - - row = {k: v.strip() for k, v in row.items()} - + row = assign_words_to_columns(line_words, self._columns, strict_rightmost=True) if any(row.values()): row_type = self._row_classifier.classify(row, self._columns) if row_type in ["transaction", "continuation"]: page_rows.append(row) - return page_rows diff --git a/packages/parser-core/src/bankstatements_core/services/content_density.py b/packages/parser-core/src/bankstatements_core/services/content_density.py index 59ea329..f0d1afb 100644 --- a/packages/parser-core/src/bankstatements_core/services/content_density.py +++ b/packages/parser-core/src/bankstatements_core/services/content_density.py @@ -8,6 +8,8 @@ import logging +from bankstatements_core.extraction.word_utils import assign_words_to_columns + logger = logging.getLogger(__name__) @@ -67,19 +69,7 @@ def analyze_content_density( for y_coord in window_y_coords: # Form row from words at this Y coordinate - row = dict.fromkeys(columns, "") - - for w in word_groups[y_coord]: - x0 = w["x0"] - text = w["text"] - - for col, (xmin, xmax) in columns.items(): - if xmin <= x0 < xmax: - row[col] += text + " " - break - - # Normalize row data - row = {k: v.strip() for k, v in row.items()} + row = 
assign_words_to_columns(word_groups[y_coord], columns) if any(row.values()): # Only count non-empty rows total_rows += 1 diff --git a/packages/parser-core/src/bankstatements_core/services/header_detection.py b/packages/parser-core/src/bankstatements_core/services/header_detection.py index 988e782..d47c13b 100644 --- a/packages/parser-core/src/bankstatements_core/services/header_detection.py +++ b/packages/parser-core/src/bankstatements_core/services/header_detection.py @@ -10,6 +10,8 @@ import logging from typing import TYPE_CHECKING +from bankstatements_core.extraction.word_utils import group_words_by_y + if TYPE_CHECKING: import pdfplumber @@ -131,7 +133,7 @@ def detect_headers( return False # Group words by Y coordinate to find potential header rows - rows_by_y = self._group_words_by_y_coordinate(words) + rows_by_y = group_words_by_y(words) # Check top few rows for header-like content top_rows = sorted(rows_by_y.keys())[:max_rows_to_check] @@ -162,30 +164,6 @@ def detect_headers( ) return False - def _group_words_by_y_coordinate( - self, words: list[dict], tolerance: float = 1.0 - ) -> dict[float, list[dict]]: - """Group words by their Y coordinate (row position). - - Words with Y coordinates within `tolerance` pixels are grouped together - as being on the same row. Uses rounding to group words efficiently. 
- - Args: - words: List of word dictionaries - tolerance: Y coordinate tolerance for grouping (pixels) - - Returns: - Dictionary mapping Y coordinates to lists of words - """ - rows_by_y: dict[float, list[dict]] = {} - - for word in words: - # Round Y coordinate to nearest integer for grouping - y_key = round(word.get("top", 0), 0) - rows_by_y.setdefault(y_key, []).append(word) - - return rows_by_y - def check_page_for_headers( self, page: pdfplumber.page.Page, diff --git a/packages/parser-core/src/bankstatements_core/services/page_validation.py b/packages/parser-core/src/bankstatements_core/services/page_validation.py index 5347886..db60457 100644 --- a/packages/parser-core/src/bankstatements_core/services/page_validation.py +++ b/packages/parser-core/src/bankstatements_core/services/page_validation.py @@ -9,6 +9,9 @@ import logging from bankstatements_core.domain.column_types import get_type_as_string +from bankstatements_core.extraction.word_utils import ( + calculate_column_coverage as _calculate_column_coverage_impl, +) logger = logging.getLogger(__name__) @@ -118,6 +121,8 @@ def calculate_column_coverage( ) -> float: """Calculate what percentage of columns have meaningful data. + Delegates to word_utils.calculate_column_coverage for the canonical implementation. + Args: rows: List of extracted rows columns: Column definitions @@ -132,18 +137,7 @@ def calculate_column_coverage( >>> service.calculate_column_coverage(rows, columns) 0.666... 
""" - if not rows or not columns: - return 0.0 - - column_names = list(columns.keys()) - columns_with_data = set() - - for row in rows: - for col_name in column_names: - if row.get(col_name, "").strip(): - columns_with_data.add(col_name) - - return len(columns_with_data) / len(column_names) + return _calculate_column_coverage_impl(rows, columns) def has_column_type( self, From feed87a80d0f591cf25a090f51dc84bb3c257440 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:14:31 +0000 Subject: [PATCH 10/14] refactor(24-02): remove private-method test calls, covered by test_word_utils - test_boundary_detector.py: delete test_group_words_by_y, test_group_words_filters_above_table_top, test_build_row_from_words, test_calculate_column_coverage_full, test_calculate_column_coverage_partial, test_calculate_column_coverage_empty - test_header_detection.py: delete test_group_words_by_y_coordinate - All deleted tests now covered by test_word_utils.py TestGroupWordsByY, TestAssignWordsToColumns, TestCalculateColumnCoverage --- .../extraction/test_boundary_detector.py | 88 ------------------- .../tests/services/test_header_detection.py | 17 ---- 2 files changed, 105 deletions(-) diff --git a/packages/parser-core/tests/extraction/test_boundary_detector.py b/packages/parser-core/tests/extraction/test_boundary_detector.py index d848f2a..27cbb90 100644 --- a/packages/parser-core/tests/extraction/test_boundary_detector.py +++ b/packages/parser-core/tests/extraction/test_boundary_detector.py @@ -61,36 +61,6 @@ def test_no_transactions_returns_fallback(self): boundary = detector.detect_boundary(words) assert boundary == 720 - def test_group_words_by_y(self): - """Test grouping words by Y-coordinate.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - words = [ - {"text": "Word1", "x0": 30, "top": 350}, - {"text": "Word2", "x0": 60, "top": 350}, - {"text": "Word3", "x0": 30, "top": 370}, - ] - lines = 
detector._group_words_by_y(words) - assert len(lines) == 2 - assert 350.0 in lines - assert 370.0 in lines - assert len(lines[350.0]) == 2 - assert len(lines[370.0]) == 1 - - def test_group_words_filters_above_table_top(self): - """Test that words above table_top_y are filtered out.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - words = [ - {"text": "Above", "x0": 30, "top": 250}, # Above table_top_y - {"text": "Below", "x0": 30, "top": 350}, # Below table_top_y - ] - lines = detector._group_words_by_y(words) - assert len(lines) == 1 - assert 350.0 in lines - def test_find_transaction_positions(self): """Test finding transaction positions.""" detector = TableBoundaryDetector( @@ -220,64 +190,6 @@ def test_detect_by_spatial_gaps_requires_no_transactions_after(self): # Should return None because there's a transaction after the gap assert result is None - def test_build_row_from_words(self): - """Test building a row from words.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - words = [ - {"text": "01", "x0": 30, "top": 350}, - {"text": "Jan", "x0": 35, "top": 350}, - {"text": "Purchase", "x0": 60, "top": 350}, - {"text": "50.00", "x0": 210, "top": 350}, - ] - row = detector._build_row_from_words(words) - assert "01 Jan" in row["Date"] - assert "Purchase" in row["Details"] - assert "50.00" in row["Debit €"] - - def test_calculate_column_coverage_full(self): - """Test column coverage calculation with full coverage.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - rows = [ - { - "Date": "01 Jan", - "Details": "Purchase", - "Debit €": "50.00", - "Credit €": "", - "Balance €": "100.00", - } - ] - coverage = detector._calculate_column_coverage(rows) - assert coverage == 0.8 # 4 out of 5 columns (Credit is empty) - - def test_calculate_column_coverage_partial(self): - """Test column coverage 
calculation with partial coverage.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - rows = [ - { - "Date": "", - "Details": "Text", - "Debit €": "", - "Credit €": "", - "Balance €": "", - } - ] - coverage = detector._calculate_column_coverage(rows) - assert coverage == 0.2 # 1 out of 5 columns - - def test_calculate_column_coverage_empty(self): - """Test column coverage calculation with empty rows.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - coverage = detector._calculate_column_coverage([]) - assert coverage == 0.0 - def test_detect_by_structure_breakdown(self): """Test detection by column structure breakdown.""" detector = TableBoundaryDetector( diff --git a/packages/parser-core/tests/services/test_header_detection.py b/packages/parser-core/tests/services/test_header_detection.py index 0ba81d7..7d32b59 100644 --- a/packages/parser-core/tests/services/test_header_detection.py +++ b/packages/parser-core/tests/services/test_header_detection.py @@ -153,23 +153,6 @@ def test_detect_headers_custom_min_keywords(self, service, sample_columns): result = service.detect_headers(words, sample_columns, min_keywords=1) assert result is True - def test_group_words_by_y_coordinate(self, service): - """Test word grouping by Y coordinate.""" - words = [ - {"text": "A", "top": 100.0}, - {"text": "B", "top": 100.5}, # Close to 100, should group - {"text": "C", "top": 105.0}, - {"text": "D", "top": 105.2}, # Close to 105, should group - ] - grouped = service._group_words_by_y_coordinate(words) - - # Should have 2 groups (rounded to 100 and 105) - assert len(grouped) == 2 - assert 100.0 in grouped - assert 105.0 in grouped - assert len(grouped[100.0]) == 2 # A and B - assert len(grouped[105.0]) == 2 # C and D - def test_header_keywords_coverage(self, service): """Test that service has comprehensive header keywords.""" # Verify key banking terms are included From 
3307b34141169e404215fe5ebb95f61fb48c3d3d Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:51:42 +0000 Subject: [PATCH 11/14] style: apply black formatting to parser-core --- .../extraction/row_builder.py | 9 ++++- .../services/pdf_processing_orchestrator.py | 6 ++- .../templates/template_detector.py | 11 +++-- .../tests/extraction/test_word_utils.py | 27 +++++++++++-- .../services/test_extraction_orchestrator.py | 14 ++++--- .../tests/test_credit_card_detection.py | 4 +- .../tests/test_excluded_files_logging.py | 40 +++++++++++++------ .../tests/test_pdf_table_extractor.py | 16 +++++--- packages/parser-core/tests/test_processor.py | 29 ++++++++++---- .../test_processor_refactored_methods.py | 34 +++++++++++----- .../test_transaction_data_no_blocking.py | 26 +++++++----- 11 files changed, 153 insertions(+), 63 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/row_builder.py b/packages/parser-core/src/bankstatements_core/extraction/row_builder.py index 47fb246..c1f59fd 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/row_builder.py +++ b/packages/parser-core/src/bankstatements_core/extraction/row_builder.py @@ -9,7 +9,10 @@ import logging from typing import TYPE_CHECKING -from bankstatements_core.extraction.word_utils import assign_words_to_columns, group_words_by_y +from bankstatements_core.extraction.word_utils import ( + assign_words_to_columns, + group_words_by_y, +) if TYPE_CHECKING: from bankstatements_core.extraction.row_classifiers import RowClassifier @@ -46,7 +49,9 @@ def build_rows(self, words: list[dict]) -> list[dict]: lines = group_words_by_y(words) page_rows = [] for _, line_words in sorted(lines.items()): - row = assign_words_to_columns(line_words, self._columns, strict_rightmost=True) + row = assign_words_to_columns( + line_words, self._columns, strict_rightmost=True + ) if any(row.values()): row_type = self._row_classifier.classify(row, self._columns) if row_type in 
["transaction", "continuation"]: diff --git a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py index 79cc0a0..95f8a9a 100644 --- a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py @@ -136,7 +136,11 @@ def process_all_pdfs( pages_read += result.page_count # Check if should be excluded (no IBAN and no data) - if result.iban is None and len(result.transactions) == 0 and result.page_count > 0: + if ( + result.iban is None + and len(result.transactions) == 0 + and result.page_count > 0 + ): excluded_files.append( { "filename": pdf.name, diff --git a/packages/parser-core/src/bankstatements_core/templates/template_detector.py b/packages/parser-core/src/bankstatements_core/templates/template_detector.py index 6791451..9c481a7 100644 --- a/packages/parser-core/src/bankstatements_core/templates/template_detector.py +++ b/packages/parser-core/src/bankstatements_core/templates/template_detector.py @@ -117,7 +117,9 @@ class DetectionExplanation: class TemplateDetector: """Orchestrates multi-signal template detection using confidence-based scoring.""" - def __init__(self, registry: TemplateRegistry, scoring: ScoringConfig | None = None): + def __init__( + self, registry: TemplateRegistry, scoring: ScoringConfig | None = None + ): """Initialize detector with template registry. 
Args: @@ -542,7 +544,9 @@ def get_detection_explanation( if tie_broken: # Determine reason without re-running full tie-break has_iban = { - tid: any(d.detector_name == "IBAN" for d in template_details.get(tid, [])) + tid: any( + d.detector_name == "IBAN" for d in template_details.get(tid, []) + ) for tid in tied_templates } if any(has_iban.values()): @@ -550,7 +554,8 @@ def get_detection_explanation( else: max_confs = { tid: max( - (d.confidence for d in template_details.get(tid, [])), default=0.0 + (d.confidence for d in template_details.get(tid, [])), + default=0.0, ) for tid in tied_templates } diff --git a/packages/parser-core/tests/extraction/test_word_utils.py b/packages/parser-core/tests/extraction/test_word_utils.py index 79f14a4..f404ea4 100644 --- a/packages/parser-core/tests/extraction/test_word_utils.py +++ b/packages/parser-core/tests/extraction/test_word_utils.py @@ -229,15 +229,32 @@ class TestCalculateColumnCoverage: def test_all_columns_have_data_returns_one(self): """When every column has data in at least one row, coverage is 1.0.""" rows = [ - {"Date": "01/01", "Details": "Tesco", "Debit": "12.50", "Credit": "", "Balance": "100.00"}, - {"Date": "", "Details": "AIB", "Debit": "", "Credit": "50.00", "Balance": ""}, + { + "Date": "01/01", + "Details": "Tesco", + "Debit": "12.50", + "Credit": "", + "Balance": "100.00", + }, + { + "Date": "", + "Details": "AIB", + "Debit": "", + "Credit": "50.00", + "Balance": "", + }, ] result = calculate_column_coverage(rows, COLUMNS) assert result == 1.0 def test_partial_coverage(self): """2 of 4 columns having data returns 0.5.""" - two_col_columns = {"Date": (0, 50), "Details": (50, 200), "Debit": (200, 250), "Credit": (250, 300)} + two_col_columns = { + "Date": (0, 50), + "Details": (50, 200), + "Debit": (200, 250), + "Credit": (250, 300), + } rows = [ {"Date": "01/01", "Details": "Tesco", "Debit": "", "Credit": ""}, ] @@ -265,7 +282,9 @@ def test_whitespace_only_values_not_counted(self): def 
test_single_row_single_column_has_data(self): """1 of 5 columns having data in one row returns 0.2.""" - rows = [{"Date": "", "Details": "Tesco", "Debit": "", "Credit": "", "Balance": ""}] + rows = [ + {"Date": "", "Details": "Tesco", "Debit": "", "Credit": "", "Balance": ""} + ] result = calculate_column_coverage(rows, COLUMNS) assert result == pytest.approx(0.2) diff --git a/packages/parser-core/tests/services/test_extraction_orchestrator.py b/packages/parser-core/tests/services/test_extraction_orchestrator.py index 49d4c84..9f91a9b 100644 --- a/packages/parser-core/tests/services/test_extraction_orchestrator.py +++ b/packages/parser-core/tests/services/test_extraction_orchestrator.py @@ -37,7 +37,9 @@ def test_extract_from_pdf_success(self, mock_extract): # Mock successful extraction mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([{"Date": "01/01/23", "Details": "Test"}]), + transactions=dicts_to_transactions( + [{"Date": "01/01/23", "Details": "Test"}] + ), page_count=5, iban="IE12BOFI90000112345", source_file=pdf_path, @@ -109,10 +111,12 @@ def test_extract_from_pdf_returns_rows_as_is(self, mock_extract): pdf_path.write_text("fake pdf") mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([ - {"Date": "01/01/23", "Details": "Test1"}, - {"Date": "02/01/23", "Details": "Test2"}, - ]), + transactions=dicts_to_transactions( + [ + {"Date": "01/01/23", "Details": "Test1"}, + {"Date": "02/01/23", "Details": "Test2"}, + ] + ), page_count=3, iban=None, source_file=pdf_path, diff --git a/packages/parser-core/tests/test_credit_card_detection.py b/packages/parser-core/tests/test_credit_card_detection.py index 85935a6..76e3791 100644 --- a/packages/parser-core/tests/test_credit_card_detection.py +++ b/packages/parser-core/tests/test_credit_card_detection.py @@ -175,7 +175,9 @@ def test_all_credit_card_patterns(self, mock_pdfplumber): extractor = PDFTableExtractor(columns=TEST_COLUMNS) result = 
extractor.extract(Path("/tmp/test.pdf")) - assert len(result.transactions) == 0, f"Failed to detect with pattern: {pattern_name}" + assert ( + len(result.transactions) == 0 + ), f"Failed to detect with pattern: {pattern_name}" @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_does_not_detect_false_positives(self, mock_pdfplumber): diff --git a/packages/parser-core/tests/test_excluded_files_logging.py b/packages/parser-core/tests/test_excluded_files_logging.py index 38492df..d59b70a 100644 --- a/packages/parser-core/tests/test_excluded_files_logging.py +++ b/packages/parser-core/tests/test_excluded_files_logging.py @@ -43,7 +43,9 @@ def test_excluded_files_json_created(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=[], page_count=1, iban=None, + transactions=[], + page_count=1, + iban=None, source_file=Path("credit_card.pdf"), warnings=["credit card statement detected, skipped"], ) @@ -84,9 +86,9 @@ def test_excluded_files_json_not_created_when_no_exclusions(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([ - {"Date": "01/12/23", "Details": "Test", "Debit €": "100"} - ]), + transactions=dicts_to_transactions( + [{"Date": "01/12/23", "Details": "Test", "Debit €": "100"}] + ), page_count=1, iban="IE29AIBK93115212345678", source_file=Path("bank_statement.pdf"), @@ -117,7 +119,9 @@ def test_excluded_files_multiple_exclusions(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=[], page_count=2, iban=None, + transactions=[], + page_count=2, + iban=None, source_file=Path("credit_card.pdf"), warnings=["credit card statement detected, skipped"], ) @@ -160,15 +164,17 @@ def 
test_excluded_files_mixed_scenarios(self): def mock_extract_side_effect(pdf_path, *args, **kwargs): if "credit_card" in str(pdf_path): return ExtractionResult( - transactions=[], page_count=1, iban=None, + transactions=[], + page_count=1, + iban=None, source_file=pdf_path, warnings=["credit card statement detected, skipped"], ) else: return ExtractionResult( - transactions=dicts_to_transactions([ - {"Date": "01/12/23", "Details": "Test", "Debit €": "100"} - ]), + transactions=dicts_to_transactions( + [{"Date": "01/12/23", "Details": "Test", "Debit €": "100"}] + ), page_count=1, iban="IE29AIBK93115212345678", source_file=pdf_path, @@ -208,7 +214,9 @@ def test_excluded_files_json_structure(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=[], page_count=3, iban=None, + transactions=[], + page_count=3, + iban=None, source_file=Path("test.pdf"), warnings=["credit card statement detected, skipped"], ) @@ -327,7 +335,9 @@ def test_pdf_with_iban_not_excluded(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=[], page_count=2, iban="IE29AIBK93115212345678", + transactions=[], + page_count=2, + iban="IE29AIBK93115212345678", source_file=Path("bank_statement.pdf"), ) @@ -361,13 +371,17 @@ def mock_extract_side_effect(pdf_path, *args, **kwargs): if "bank_with_iban" in str(pdf_path): # Has IBAN, empty rows (should NOT be excluded) return ExtractionResult( - transactions=[], page_count=2, iban="IE29AIBK93115212345678", + transactions=[], + page_count=2, + iban="IE29AIBK93115212345678", source_file=pdf_path, ) else: # No IBAN, empty rows (SHOULD be excluded) return ExtractionResult( - transactions=[], page_count=1, iban=None, + transactions=[], + page_count=1, + iban=None, source_file=pdf_path, warnings=["credit card statement detected, skipped"], ) 
diff --git a/packages/parser-core/tests/test_pdf_table_extractor.py b/packages/parser-core/tests/test_pdf_table_extractor.py index d515393..1d2bec0 100644 --- a/packages/parser-core/tests/test_pdf_table_extractor.py +++ b/packages/parser-core/tests/test_pdf_table_extractor.py @@ -84,7 +84,9 @@ def test_extract_tables_with_date_propagation(self, mock_pdfplumber): self.assertEqual(result.transactions[0].filename, "test.pdf") # Check second transaction (should inherit date from first) - self.assertEqual(result.transactions[1].date.strip(), "01 Jan 2024") # Inherited! + self.assertEqual( + result.transactions[1].date.strip(), "01 Jan 2024" + ) # Inherited! self.assertEqual(result.transactions[1].details.strip(), "Coffee Shop") self.assertEqual(result.transactions[1].debit.strip(), "5.50") @@ -416,7 +418,9 @@ def crop_side_effect(*args): # Should have fewer rows (administrative content filtered) self.assertEqual(len(dynamic_result.transactions), 1) - self.assertEqual(dynamic_result.transactions[0].details.strip(), "VDC-STORE") + self.assertEqual( + dynamic_result.transactions[0].details.strip(), "VDC-STORE" + ) run_test() @@ -638,9 +642,7 @@ def test_extract_tables_with_page_validation_enabled(self, mock_pdfplumber): test_pdf_path = Path("/tmp/test.pdf") # Test with page validation enabled - result = extract_tables_from_pdf( - test_pdf_path, enable_page_validation=True - ) + result = extract_tables_from_pdf(test_pdf_path, enable_page_validation=True) # Should have processed only the valid page self.assertEqual(result.page_count, 2) # Total pages processed @@ -657,7 +659,9 @@ def test_extract_tables_with_page_validation_enabled(self, mock_pdfplumber): ) # Should process both pages (more rows due to invalid page content) - self.assertGreaterEqual(len(result_no_val.transactions), len(result.transactions)) + self.assertGreaterEqual( + len(result_no_val.transactions), len(result.transactions) + ) def test_analyze_content_density_empty_word_groups(self): """Test 
analyze_content_density with empty word groups.""" diff --git a/packages/parser-core/tests/test_processor.py b/packages/parser-core/tests/test_processor.py index eb8d05e..1b43ca0 100644 --- a/packages/parser-core/tests/test_processor.py +++ b/packages/parser-core/tests/test_processor.py @@ -289,13 +289,21 @@ def test_processor_continues_after_pdf_error(self, mock_extract): # Mock: first PDF succeeds, second fails, third succeeds mock_extract.side_effect = [ ExtractionResult( - transactions=dicts_to_transactions([{"Date": "01 Jan 2024", "Details": "Transaction 1"}]), - page_count=1, iban=None, source_file=Path("good.pdf"), + transactions=dicts_to_transactions( + [{"Date": "01 Jan 2024", "Details": "Transaction 1"}] + ), + page_count=1, + iban=None, + source_file=Path("good.pdf"), ), OSError("Failed to open PDF"), ExtractionResult( - transactions=dicts_to_transactions([{"Date": "02 Jan 2024", "Details": "Transaction 2"}]), - page_count=1, iban=None, source_file=Path("good2.pdf"), + transactions=dicts_to_transactions( + [{"Date": "02 Jan 2024", "Details": "Transaction 2"}] + ), + page_count=1, + iban=None, + source_file=Path("good2.pdf"), ), ] @@ -319,12 +327,19 @@ def test_processor_handles_empty_pdf_gracefully(self, mock_extract): # Mock: first PDF returns no rows, second has data mock_extract.side_effect = [ ExtractionResult( - transactions=[], page_count=1, iban=None, source_file=Path("empty.pdf"), + transactions=[], + page_count=1, + iban=None, + source_file=Path("empty.pdf"), warnings=["credit card statement detected, skipped"], ), ExtractionResult( - transactions=dicts_to_transactions([{"Date": "01 Jan 2024", "Details": "Transaction 1"}]), - page_count=1, iban=None, source_file=Path("with_data.pdf"), + transactions=dicts_to_transactions( + [{"Date": "01 Jan 2024", "Details": "Transaction 1"}] + ), + page_count=1, + iban=None, + source_file=Path("with_data.pdf"), ), ] diff --git a/packages/parser-core/tests/test_processor_refactored_methods.py 
b/packages/parser-core/tests/test_processor_refactored_methods.py index 92ff97d..72d24b9 100644 --- a/packages/parser-core/tests/test_processor_refactored_methods.py +++ b/packages/parser-core/tests/test_processor_refactored_methods.py @@ -78,12 +78,20 @@ def test_process_all_pdfs_multiple_files(self, mock_extract): # Mock extract_tables_from_pdf to return different data mock_extract.side_effect = [ ExtractionResult( - transactions=dicts_to_transactions([{"Date": "01/01/23", "Details": "Test1"}]), - page_count=5, iban=None, source_file=Path("file1.pdf"), + transactions=dicts_to_transactions( + [{"Date": "01/01/23", "Details": "Test1"}] + ), + page_count=5, + iban=None, + source_file=Path("file1.pdf"), ), ExtractionResult( - transactions=dicts_to_transactions([{"Date": "02/01/23", "Details": "Test2"}]), - page_count=3, iban=None, source_file=Path("file2.pdf"), + transactions=dicts_to_transactions( + [{"Date": "02/01/23", "Details": "Test2"}] + ), + page_count=3, + iban=None, + source_file=Path("file2.pdf"), ), ] @@ -93,8 +101,12 @@ def test_process_all_pdfs_multiple_files(self, mock_extract): self.assertEqual(len(results), 2) self.assertEqual(results[0].page_count, 5) self.assertEqual(results[1].page_count, 3) - self.assertEqual(results[0].transactions[0].to_dict()["Details"], "Test1") - self.assertEqual(results[1].transactions[0].to_dict()["Details"], "Test2") + self.assertEqual( + results[0].transactions[0].to_dict()["Details"], "Test1" + ) + self.assertEqual( + results[1].transactions[0].to_dict()["Details"], "Test2" + ) self.assertIsNone(results[0].iban) self.assertIsNone(results[1].iban) @@ -332,10 +344,12 @@ def test_run_uses_refactored_methods(self, mock_extract): # Mock extract_tables_from_pdf mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([ - {"Date": "15/06/23", "Details": "Second", "Debit €": "200"}, - {"Date": "01/01/23", "Details": "First", "Debit €": "100"}, - ]), + transactions=dicts_to_transactions( + [ + {"Date": 
"15/06/23", "Details": "Second", "Debit €": "200"}, + {"Date": "01/01/23", "Details": "First", "Debit €": "100"}, + ] + ), page_count=2, iban=None, source_file=Path("test.pdf"), diff --git a/packages/parser-core/tests/test_transaction_data_no_blocking.py b/packages/parser-core/tests/test_transaction_data_no_blocking.py index ae2596f..5a062e0 100644 --- a/packages/parser-core/tests/test_transaction_data_no_blocking.py +++ b/packages/parser-core/tests/test_transaction_data_no_blocking.py @@ -201,13 +201,15 @@ def test_processor_does_not_use_transaction_data_for_exclusion(self): ) as mock_extract: # Returns: ExtractionResult with VISA in transactions, IBAN from header mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([ - { - "Date": "11 Aug", - "Details": "MOBI CLICK VISA", - "Debit €": "100.00", - } - ]), + transactions=dicts_to_transactions( + [ + { + "Date": "11 Aug", + "Details": "MOBI CLICK VISA", + "Debit €": "100.00", + } + ] + ), page_count=1, iban="IE48AIBK93408921459015", source_file=Path("test.pdf"), @@ -246,10 +248,12 @@ def test_all_card_brands_in_transactions_allowed(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([ - {"Date": "11 Aug", "Details": detail} - for detail in transaction_details - ]), + transactions=dicts_to_transactions( + [ + {"Date": "11 Aug", "Details": detail} + for detail in transaction_details + ] + ), page_count=1, iban="IE48AIBK93408921459015", source_file=Path("test.pdf"), From d9ade9b8de68999659f21dbf65f22d8e681ff5b1 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:53:30 +0000 Subject: [PATCH 12/14] style: fix isort import ordering in parser-core --- .../src/bankstatements_core/extraction/extraction_facade.py | 2 +- .../src/bankstatements_core/extraction/pdf_extractor.py | 4 ++-- packages/parser-core/src/bankstatements_core/processor.py | 4 
++-- packages/parser-core/tests/test_pdf_table_extractor.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py b/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py index 7fdc3d6..7c4f9c0 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py +++ b/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py @@ -11,8 +11,8 @@ from typing import TYPE_CHECKING from bankstatements_core.config.column_config import DEFAULT_COLUMNS -from bankstatements_core.extraction.extraction_params import TABLE_BOTTOM_Y, TABLE_TOP_Y from bankstatements_core.domain import ExtractionResult +from bankstatements_core.extraction.extraction_params import TABLE_BOTTOM_Y, TABLE_TOP_Y if TYPE_CHECKING: from bankstatements_core.extraction.row_classifiers import RowClassifier diff --git a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py index 549e6c2..b1648a2 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py @@ -13,6 +13,8 @@ if TYPE_CHECKING: from bankstatements_core.domain.protocols.pdf_reader import IPDFReader +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.extraction.iban_extractor import IBANExtractor from bankstatements_core.extraction.page_header_analyser import PageHeaderAnalyser from bankstatements_core.extraction.row_builder import RowBuilder @@ -22,8 +24,6 @@ StatefulPageRowProcessor, extract_filename_date, ) -from bankstatements_core.domain import ExtractionResult -from bankstatements_core.domain.converters import dicts_to_transactions logger = logging.getLogger(__name__) diff --git 
a/packages/parser-core/src/bankstatements_core/processor.py b/packages/parser-core/src/bankstatements_core/processor.py index 5da8dca..1d410d5 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -11,6 +11,8 @@ from bankstatements_core.config.column_config import DEFAULT_COLUMNS, get_column_names from bankstatements_core.config.processor_config import ProcessorConfig from bankstatements_core.config.totals_config import parse_totals_columns # noqa: F401 +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import transactions_to_dicts from bankstatements_core.services.column_analysis import ColumnAnalysisService from bankstatements_core.services.date_parser import DateParserService from bankstatements_core.services.duplicate_detector import DuplicateDetectionService @@ -30,8 +32,6 @@ TransactionProcessingOrchestrator, ) from bankstatements_core.utils import is_date_column, to_float # noqa: F401 -from bankstatements_core.domain import ExtractionResult -from bankstatements_core.domain.converters import transactions_to_dicts logger = logging.getLogger(__name__) diff --git a/packages/parser-core/tests/test_pdf_table_extractor.py b/packages/parser-core/tests/test_pdf_table_extractor.py index 1d2bec0..500fa69 100644 --- a/packages/parser-core/tests/test_pdf_table_extractor.py +++ b/packages/parser-core/tests/test_pdf_table_extractor.py @@ -6,9 +6,9 @@ from pathlib import Path from unittest.mock import MagicMock, Mock, patch -from bankstatements_core.extraction.column_identifier import ColumnTypeIdentifier from bankstatements_core.domain import ExtractionResult from bankstatements_core.domain.converters import dicts_to_transactions +from bankstatements_core.extraction.column_identifier import ColumnTypeIdentifier from bankstatements_core.pdf_table_extractor import ( DEFAULT_COLUMNS, analyze_content_density, From 
4d609703faad7754a0cb35d01f9888c693b60eb8 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:56:57 +0000 Subject: [PATCH 13/14] fix: remove unused imports and variables flagged by flake8 --- .../src/bankstatements_core/templates/template_detector.py | 2 +- packages/parser-core/tests/extraction/test_pdf_extractor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/templates/template_detector.py b/packages/parser-core/src/bankstatements_core/templates/template_detector.py index 9c481a7..7ca1ba2 100644 --- a/packages/parser-core/src/bankstatements_core/templates/template_detector.py +++ b/packages/parser-core/src/bankstatements_core/templates/template_detector.py @@ -5,7 +5,7 @@ import logging import re from collections import defaultdict -from dataclasses import dataclass, field +from dataclasses import dataclass from pathlib import Path from pdfplumber.page import Page diff --git a/packages/parser-core/tests/extraction/test_pdf_extractor.py b/packages/parser-core/tests/extraction/test_pdf_extractor.py index 5fa3e2b..a0424f7 100644 --- a/packages/parser-core/tests/extraction/test_pdf_extractor.py +++ b/packages/parser-core/tests/extraction/test_pdf_extractor.py @@ -538,7 +538,7 @@ def test_per_page_header_check_top_y_override(self, mock_pdfplumber): enable_header_check=True, # Enable header check ) - result = extractor.extract(Path("/tmp/test.pdf")) + extractor.extract(Path("/tmp/test.pdf")) # Verify header area crop was called with override value (450) header_crop_calls = [ From 60df9ad55e2280ecc5398e5e5306100d46a6770e Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:59:46 +0000 Subject: [PATCH 14/14] fix: rename loop var to avoid ExtractionResult/dict type collision in mypy --- .../parser-core/src/bankstatements_core/processor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/processor.py 
b/packages/parser-core/src/bankstatements_core/processor.py index 1d410d5..e049574 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -314,11 +314,11 @@ def run(self) -> dict: all_rows: list[dict] = [] pages_read = 0 pdf_ibans: dict[str, str] = {} - for result in extraction_results: - pages_read += result.page_count - if result.iban: - pdf_ibans[result.source_file.name] = result.iban - all_rows.extend(transactions_to_dicts(result.transactions)) + for extraction in extraction_results: + pages_read += extraction.page_count + if extraction.iban: + pdf_ibans[extraction.source_file.name] = extraction.iban + all_rows.extend(transactions_to_dicts(extraction.transactions)) pdf_count = len(sorted(self.input_dir.glob("*.pdf"))) # Step 2: Group transactions by IBAN (delegated to orchestrator)