From 85ac5efc3f4eadc05506f63d920b4ccad2284de3 Mon Sep 17 00:00:00 2001 From: longieirl Date: Tue, 24 Mar 2026 23:09:24 +0000 Subject: [PATCH 01/14] feat(#32): make ScoringConfig injectable in TemplateDetector - Replace global DETECTOR_WEIGHTS / MIN_CONFIDENCE_THRESHOLD refs with self._scoring.weight_for() / self._scoring.min_confidence_threshold - Extract _run_scoring() private helper (scoring loop) from detect_template() - Add get_detection_explanation() -> DetectionExplanation for diagnostic introspection - Export ScoringConfig and DetectionExplanation from templates/__init__.py - Add 4 new tests: threshold boundary pass/fail, weight ordering, tie-break reason --- .../bankstatements_core/templates/__init__.py | 8 +- .../templates/template_detector.py | 335 +++++++++++++++--- .../tests/templates/test_template_detector.py | 270 +++++++++++++- 3 files changed, 554 insertions(+), 59 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/templates/__init__.py b/packages/parser-core/src/bankstatements_core/templates/__init__.py index 97db2ed..f50f9d0 100644 --- a/packages/parser-core/src/bankstatements_core/templates/__init__.py +++ b/packages/parser-core/src/bankstatements_core/templates/__init__.py @@ -2,7 +2,11 @@ from __future__ import annotations -from bankstatements_core.templates.template_detector import TemplateDetector +from bankstatements_core.templates.template_detector import ( + DetectionExplanation, + ScoringConfig, + TemplateDetector, +) from bankstatements_core.templates.template_model import ( BankTemplate, TemplateDetectionConfig, @@ -13,6 +17,8 @@ __all__ = [ "BankTemplate", + "DetectionExplanation", + "ScoringConfig", "TemplateDetectionConfig", "TemplateExtractionConfig", "TemplateProcessingConfig", diff --git a/packages/parser-core/src/bankstatements_core/templates/template_detector.py b/packages/parser-core/src/bankstatements_core/templates/template_detector.py index 4f4bf29..6791451 100644 --- 
a/packages/parser-core/src/bankstatements_core/templates/template_detector.py +++ b/packages/parser-core/src/bankstatements_core/templates/template_detector.py @@ -5,6 +5,7 @@ import logging import re from collections import defaultdict +from dataclasses import dataclass, field from pathlib import Path from pdfplumber.page import Page @@ -25,35 +26,107 @@ logger = logging.getLogger(__name__) -# Detector weights for Phase 2 aggregated scoring -# Higher weight = more reliable/specific detector -DETECTOR_WEIGHTS = { - "IBAN": 2.0, # Most reliable for bank statements (IBAN is unique) - "CardNumber": 2.0, # NEW: Most reliable for credit card statements - "LoanReference": 2.0, # NEW: Most reliable for loan statements - "ColumnHeader": 1.5, # Strong indicator of template structure - "Header": 1.0, # Baseline weight - "Filename": 0.8, # Less reliable (users can rename files) - "Exclusion": 0.0, # Binary filter, not used in scoring -} - -# Minimum aggregate confidence required to select a template -# Prevents weak matches from being selected -# Threshold of 0.6 allows single detector matches (e.g., Filename @ 0.85 * 0.8 = 0.68) -# while rejecting very weak matches (< 0.6) -MIN_CONFIDENCE_THRESHOLD = 0.6 + +@dataclass(frozen=True) +class ScoringConfig: + """Injectable scoring policy for TemplateDetector. + + Controls detector weights and the minimum aggregate confidence required + to select a template. Use ScoringConfig.default() for production behaviour. + + Attributes: + weights: Mapping of detector name to score multiplier. Unknown detector + names fall back to 1.0. + min_confidence_threshold: Minimum weighted aggregate score a template + must reach to be selected (exclusive lower bound + vs default; >= threshold passes). 
+ """ + + weights: dict[str, float] + min_confidence_threshold: float + + def __post_init__(self) -> None: + if not 0.0 < self.min_confidence_threshold <= 1.0: + raise ValueError( + f"min_confidence_threshold must be in (0.0, 1.0], " + f"got {self.min_confidence_threshold}" + ) + for name, w in self.weights.items(): + if w < 0.0: + raise ValueError( + f"Weight for detector '{name}' must be >= 0.0, got {w}" + ) + + @classmethod + def default(cls) -> "ScoringConfig": + """Production scoring — used when no config is injected.""" + return cls( + weights={ + "IBAN": 2.0, + "CardNumber": 2.0, + "LoanReference": 2.0, + "ColumnHeader": 1.5, + "Header": 1.0, + "Filename": 0.8, + "Exclusion": 0.0, + }, + min_confidence_threshold=0.6, + ) + + def weight_for(self, detector_name: str) -> float: + """Return the weight for a detector name, defaulting to 1.0 for unknowns.""" + return self.weights.get(detector_name, 1.0) + + +@dataclass +class DetectionExplanation: + """Structured account of why a template was (or was not) selected. + + Returned by TemplateDetector.get_detection_explanation(). Useful for + debugging mis-detections and writing tests that assert on scoring + outcomes without mocking individual detectors. + + Attributes: + selected_template_id: ID of the selected template, or None if default used. + selected_score: Weighted aggregate score of the selected template (0.0 if + default fallback was used). + threshold: The min_confidence_threshold that was applied. + passed_threshold: True if selected_score >= threshold. + per_template_scores: Weighted aggregate score for every candidate template. + per_template_breakdown: Human-readable per-detector contribution strings, + e.g. {"aib": ["IBAN=0.95*2.0=1.90", ...]}. + tie_broken: True if multiple templates shared the top score. + tie_winner_reason: One of "IBAN match", "max confidence", "alphabetical", + or None if no tie occurred. + used_default: True when the returned template is the registry default. 
+ default_reason: Why the default was used, or None if a real match was found. + """ + + selected_template_id: str | None + selected_score: float + threshold: float + passed_threshold: bool + per_template_scores: dict[str, float] + per_template_breakdown: dict[str, list[str]] + tie_broken: bool + tie_winner_reason: str | None + used_default: bool + default_reason: str | None class TemplateDetector: """Orchestrates multi-signal template detection using confidence-based scoring.""" - def __init__(self, registry: TemplateRegistry): + def __init__(self, registry: TemplateRegistry, scoring: ScoringConfig | None = None): """Initialize detector with template registry. Args: registry: TemplateRegistry containing available templates + scoring: Optional scoring config (weights + threshold). Defaults to + ScoringConfig.default() — production weights, threshold 0.6. """ self.registry = registry + self._scoring = scoring if scoring is not None else ScoringConfig.default() # Initialize detector chain # ExclusionDetector runs FIRST to filter out excluded templates @@ -193,43 +266,9 @@ def detect_template(self, pdf_path: Path, first_page: Page) -> BankTemplate: logger.warning("No enabled templates found, using default") return self.registry.get_default_template() - # Phase 2: Aggregate scores from all detectors - template_scores: dict[str, float] = defaultdict(float) - template_details: dict[str, list[DetectionResult]] = defaultdict(list) - excluded_templates: set[str] = set() - - for detector in self.detectors: - try: - results = detector.detect(pdf_path, first_page, templates) - if results: - for result in results: - # Skip excluded templates (confidence = 0.0) - if result.confidence == 0.0: - logger.debug( - f"Template '{result.template.id}' excluded by " - f"{result.detector_name}" - ) - # Mark as excluded - excluded_templates.add(result.template.id) - continue - - # Skip templates that were already excluded - if result.template.id in excluded_templates: - continue - - # Apply 
detector-specific weight - weight = DETECTOR_WEIGHTS.get(result.detector_name, 1.0) - weighted_score = result.confidence * weight - - template_scores[result.template.id] += weighted_score - template_details[result.template.id].append(result) - - except (AttributeError, ValueError, TypeError, OSError) as e: - # Expected errors: PDF access issues, text extraction failures, type errors - logger.error( - f"Error in {detector.name} detector for {pdf_path.name}: {e}" - ) - # Let unexpected errors bubble up + template_scores, template_details, excluded_templates = self._run_scoring( + pdf_path, first_page, templates + ) # Remove excluded templates valid_scores = { @@ -272,10 +311,10 @@ def detect_template(self, pdf_path: Path, first_page: Page) -> BankTemplate: best_score = valid_scores[best_template_id] # Phase 2: Check minimum confidence threshold - if best_score < MIN_CONFIDENCE_THRESHOLD: + if best_score < self._scoring.min_confidence_threshold: logger.warning( f"Best match '{best_template_id}' has confidence {best_score:.2f}, " - f"below threshold {MIN_CONFIDENCE_THRESHOLD}. Using default template." + f"below threshold {self._scoring.min_confidence_threshold}. Using default template." ) if document_type: return self.registry.get_default_for_type(document_type) @@ -308,6 +347,54 @@ def detect_template(self, pdf_path: Path, first_page: Page) -> BankTemplate: ) return best_template + def _run_scoring( + self, + pdf_path: Path, + first_page: Page, + templates: list[BankTemplate], + ) -> tuple[dict[str, float], dict[str, list[DetectionResult]], set[str]]: + """Run all detectors and aggregate weighted scores per template. + + Args: + pdf_path: Path to the PDF file. + first_page: First page of the PDF. + templates: Candidate templates to score. + + Returns: + Tuple of (template_scores, template_details, excluded_templates). 
+ """ + template_scores: dict[str, float] = defaultdict(float) + template_details: dict[str, list[DetectionResult]] = defaultdict(list) + excluded_templates: set[str] = set() + + for detector in self.detectors: + try: + results = detector.detect(pdf_path, first_page, templates) + if results: + for result in results: + if result.confidence == 0.0: + logger.debug( + f"Template '{result.template.id}' excluded by " + f"{result.detector_name}" + ) + excluded_templates.add(result.template.id) + continue + + if result.template.id in excluded_templates: + continue + + weight = self._scoring.weight_for(result.detector_name) + weighted_score = result.confidence * weight + template_scores[result.template.id] += weighted_score + template_details[result.template.id].append(result) + + except (AttributeError, ValueError, TypeError, OSError) as e: + logger.error( + f"Error in {detector.name} detector for {pdf_path.name}: {e}" + ) + + return dict(template_scores), dict(template_details), excluded_templates + def _break_tie( self, tied_templates: list[str], @@ -354,6 +441,140 @@ def _break_tie( logger.debug(f"Tie-breaker: {alphabetical_first} (alphabetically first)") return alphabetical_first + def get_detection_explanation( + self, pdf_path: Path, first_page: Page + ) -> DetectionExplanation: + """Return a structured explanation of template scoring for a given PDF. + + Runs the same scoring logic as detect_template() but returns full + diagnostic detail rather than just the winning template. + + Args: + pdf_path: Path to the PDF file. + first_page: First page of the PDF. + + Returns: + DetectionExplanation with per-template scores, breakdown, and + tie-break metadata. 
+ """ + document_type = self._classify_document_type(first_page) + + if document_type: + templates = self.registry.get_templates_by_type(document_type) + if not templates: + templates = self.registry.get_all_templates() + else: + templates = self.registry.get_all_templates() + + threshold = self._scoring.min_confidence_threshold + + if not templates: + return DetectionExplanation( + selected_template_id=None, + selected_score=0.0, + threshold=threshold, + passed_threshold=False, + per_template_scores={}, + per_template_breakdown={}, + tie_broken=False, + tie_winner_reason=None, + used_default=True, + default_reason="no templates available", + ) + + template_scores, template_details, excluded_templates = self._run_scoring( + pdf_path, first_page, templates + ) + + valid_scores = { + tid: score + for tid, score in template_scores.items() + if tid not in excluded_templates and score > 0.0 + } + + per_template_breakdown: dict[str, list[str]] = { + tid: [ + f"{d.detector_name}={d.confidence:.2f}" + f"*{self._scoring.weight_for(d.detector_name):.1f}" + f"={d.confidence * self._scoring.weight_for(d.detector_name):.2f}" + for d in sorted(details, key=lambda x: x.confidence, reverse=True) + ] + for tid, details in template_details.items() + } + + if not valid_scores: + return DetectionExplanation( + selected_template_id=None, + selected_score=0.0, + threshold=threshold, + passed_threshold=False, + per_template_scores=valid_scores, + per_template_breakdown=per_template_breakdown, + tie_broken=False, + tie_winner_reason=None, + used_default=True, + default_reason="no templates scored above zero", + ) + + best_template_id = max(valid_scores, key=lambda x: valid_scores[x]) + best_score = valid_scores[best_template_id] + + if best_score < threshold: + return DetectionExplanation( + selected_template_id=best_template_id, + selected_score=best_score, + threshold=threshold, + passed_threshold=False, + per_template_scores=valid_scores, + 
per_template_breakdown=per_template_breakdown, + tie_broken=False, + tie_winner_reason=None, + used_default=True, + default_reason=f"best score {best_score:.2f} below threshold {threshold}", + ) + + tied_templates = [ + tid for tid, score in valid_scores.items() if score == best_score + ] + tie_broken = len(tied_templates) > 1 + tie_winner_reason: str | None = None + + if tie_broken: + # Determine reason without re-running full tie-break + has_iban = { + tid: any(d.detector_name == "IBAN" for d in template_details.get(tid, [])) + for tid in tied_templates + } + if any(has_iban.values()): + tie_winner_reason = "IBAN match" + else: + max_confs = { + tid: max( + (d.confidence for d in template_details.get(tid, [])), default=0.0 + ) + for tid in tied_templates + } + top_max = max(max_confs.values()) + winners_by_max = [tid for tid, c in max_confs.items() if c == top_max] + if len(winners_by_max) == 1: + tie_winner_reason = "max confidence" + else: + tie_winner_reason = "alphabetical" + best_template_id = self._break_tie(tied_templates, template_details) + + return DetectionExplanation( + selected_template_id=best_template_id, + selected_score=best_score, + threshold=threshold, + passed_threshold=True, + per_template_scores=valid_scores, + per_template_breakdown=per_template_breakdown, + tie_broken=tie_broken, + tie_winner_reason=tie_winner_reason, + used_default=False, + default_reason=None, + ) + def force_template(self, template_id: str) -> BankTemplate | None: """Force use of specific template by ID. 
diff --git a/packages/parser-core/tests/templates/test_template_detector.py b/packages/parser-core/tests/templates/test_template_detector.py index e71ff62..5f58153 100644 --- a/packages/parser-core/tests/templates/test_template_detector.py +++ b/packages/parser-core/tests/templates/test_template_detector.py @@ -8,7 +8,11 @@ import pytest from bankstatements_core.templates.detectors.base import DetectionResult -from bankstatements_core.templates.template_detector import TemplateDetector +from bankstatements_core.templates.template_detector import ( + DetectionExplanation, + ScoringConfig, + TemplateDetector, +) from bankstatements_core.templates.template_model import ( BankTemplate, TemplateDetectionConfig, @@ -808,3 +812,267 @@ def test_detect_uses_type_specific_default( # Should call get_default_for_type with "credit_card_statement" mock_registry.get_default_for_type.assert_called_with("credit_card_statement") assert result == credit_card_default + + # ----------------------------------------------------------------------- + # ScoringConfig tests + # ----------------------------------------------------------------------- + + @patch( + "bankstatements_core.templates.detectors.exclusion_detector.ExclusionDetector.detect" + ) + @patch("bankstatements_core.templates.detectors.iban_detector.IBANDetector.detect") + @patch( + "bankstatements_core.templates.detectors.card_number_detector.CardNumberDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.loan_reference_detector.LoanReferenceDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.filename_detector.FilenameDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.header_detector.HeaderDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.column_header_detector.ColumnHeaderDetector.detect" + ) + def test_threshold_boundary_passes( + self, + mock_column_detect, + mock_header_detect, + mock_filename_detect, + mock_loan_detect, + 
mock_card_detect, + mock_iban_detect, + mock_exclusion_detect, + mock_registry, + mock_page, + ): + """Filename score exactly at threshold (0.75 * 0.8 = 0.60 >= 0.60) selects template.""" + revolut_template = mock_registry.get_all_templates()[1] + + mock_exclusion_detect.return_value = [] + mock_iban_detect.return_value = [] + mock_card_detect.return_value = [] + mock_loan_detect.return_value = [] + mock_header_detect.return_value = [] + mock_column_detect.return_value = [] + mock_filename_detect.return_value = [ + DetectionResult( + template=revolut_template, + confidence=0.75, # 0.75 * 0.8 = 0.60 — exactly at threshold + detector_name="Filename", + match_details={}, + ) + ] + + scoring = ScoringConfig( + weights={"Filename": 0.8}, + min_confidence_threshold=0.60, + ) + detector = TemplateDetector(mock_registry, scoring=scoring) + result = detector.detect_template(Path("test.pdf"), mock_page) + + assert result == revolut_template + + @patch( + "bankstatements_core.templates.detectors.exclusion_detector.ExclusionDetector.detect" + ) + @patch("bankstatements_core.templates.detectors.iban_detector.IBANDetector.detect") + @patch( + "bankstatements_core.templates.detectors.card_number_detector.CardNumberDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.loan_reference_detector.LoanReferenceDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.filename_detector.FilenameDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.header_detector.HeaderDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.column_header_detector.ColumnHeaderDetector.detect" + ) + def test_threshold_boundary_fails( + self, + mock_column_detect, + mock_header_detect, + mock_filename_detect, + mock_loan_detect, + mock_card_detect, + mock_iban_detect, + mock_exclusion_detect, + mock_registry, + mock_page, + ): + """Filename score just below threshold (0.74 * 0.8 = 0.592 < 0.60) uses default.""" + revolut_template = 
mock_registry.get_all_templates()[1] + default_template = mock_registry.get_default_template() + + mock_exclusion_detect.return_value = [] + mock_iban_detect.return_value = [] + mock_card_detect.return_value = [] + mock_loan_detect.return_value = [] + mock_header_detect.return_value = [] + mock_column_detect.return_value = [] + mock_filename_detect.return_value = [ + DetectionResult( + template=revolut_template, + confidence=0.74, # 0.74 * 0.8 = 0.592 — just below threshold + detector_name="Filename", + match_details={}, + ) + ] + + scoring = ScoringConfig( + weights={"Filename": 0.8}, + min_confidence_threshold=0.60, + ) + detector = TemplateDetector(mock_registry, scoring=scoring) + result = detector.detect_template(Path("test.pdf"), mock_page) + + assert result == default_template + + @patch( + "bankstatements_core.templates.detectors.exclusion_detector.ExclusionDetector.detect" + ) + @patch("bankstatements_core.templates.detectors.iban_detector.IBANDetector.detect") + @patch( + "bankstatements_core.templates.detectors.card_number_detector.CardNumberDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.loan_reference_detector.LoanReferenceDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.filename_detector.FilenameDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.header_detector.HeaderDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.column_header_detector.ColumnHeaderDetector.detect" + ) + def test_weight_ordering_iban_beats_column_header( + self, + mock_column_detect, + mock_header_detect, + mock_filename_detect, + mock_loan_detect, + mock_card_detect, + mock_iban_detect, + mock_exclusion_detect, + mock_registry, + mock_page, + ): + """IBAN weight 2.0 (0.5*2.0=1.0) beats ColumnHeader weight 1.5 (0.6*1.5=0.9).""" + aib_template = mock_registry.get_all_templates()[0] + revolut_template = mock_registry.get_all_templates()[1] + + mock_exclusion_detect.return_value = [] + 
mock_card_detect.return_value = [] + mock_loan_detect.return_value = [] + mock_filename_detect.return_value = [] + mock_header_detect.return_value = [] + # AIB scores via IBAN: 0.5 * 2.0 = 1.0 + mock_iban_detect.return_value = [ + DetectionResult( + template=aib_template, + confidence=0.5, + detector_name="IBAN", + match_details={}, + ) + ] + # Revolut scores via ColumnHeader: 0.6 * 1.5 = 0.9 + mock_column_detect.return_value = [ + DetectionResult( + template=revolut_template, + confidence=0.6, + detector_name="ColumnHeader", + match_details={}, + ) + ] + + scoring = ScoringConfig.default() + detector = TemplateDetector(mock_registry, scoring=scoring) + result = detector.detect_template(Path("test.pdf"), mock_page) + + # AIB wins: 1.0 > 0.9 + assert result == aib_template + + @patch( + "bankstatements_core.templates.detectors.exclusion_detector.ExclusionDetector.detect" + ) + @patch("bankstatements_core.templates.detectors.iban_detector.IBANDetector.detect") + @patch( + "bankstatements_core.templates.detectors.card_number_detector.CardNumberDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.loan_reference_detector.LoanReferenceDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.filename_detector.FilenameDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.header_detector.HeaderDetector.detect" + ) + @patch( + "bankstatements_core.templates.detectors.column_header_detector.ColumnHeaderDetector.detect" + ) + def test_get_detection_explanation_tie_break_reason( + self, + mock_column_detect, + mock_header_detect, + mock_filename_detect, + mock_loan_detect, + mock_card_detect, + mock_iban_detect, + mock_exclusion_detect, + mock_registry, + mock_page, + ): + """get_detection_explanation populates tie_winner_reason when a tie is broken.""" + aib_template = mock_registry.get_all_templates()[0] + revolut_template = mock_registry.get_all_templates()[1] + + mock_exclusion_detect.return_value = [] + 
mock_card_detect.return_value = [] + mock_loan_detect.return_value = [] + mock_filename_detect.return_value = [] + # Create a tie: AIB via IBAN (0.25*2.0=0.5) + Header (0.3*1.0=0.3) = 0.8 + # Revolut via Header (0.8*1.0=0.8) = 0.8 + mock_iban_detect.return_value = [ + DetectionResult( + template=aib_template, + confidence=0.25, + detector_name="IBAN", + match_details={}, + ) + ] + mock_header_detect.return_value = [ + DetectionResult( + template=aib_template, + confidence=0.3, + detector_name="Header", + match_details={}, + ), + DetectionResult( + template=revolut_template, + confidence=0.8, + detector_name="Header", + match_details={}, + ), + ] + mock_column_detect.return_value = [] + + detector = TemplateDetector(mock_registry) + explanation = detector.get_detection_explanation(Path("test.pdf"), mock_page) + + assert isinstance(explanation, DetectionExplanation) + assert explanation.tie_broken is True + assert explanation.tie_winner_reason == "IBAN match" + assert explanation.selected_template_id == "aib" + assert explanation.passed_threshold is True + assert explanation.used_default is False + assert "aib" in explanation.per_template_scores + assert "revolut" in explanation.per_template_scores From 55b66434741a6f848ed051afa4a5a8b503a6da9b Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 07:54:56 +0000 Subject: [PATCH 02/14] feat(23-01): wire PDFTableExtractor.extract() and facade to return ExtractionResult - PDFTableExtractor.extract() now returns ExtractionResult (not 3-tuple) - Credit card early-return path returns ExtractionResult with warnings entry - Normal return path wraps dicts_to_transactions(rows) in ExtractionResult - extraction_facade.extract_tables_from_pdf() return annotation -> ExtractionResult - All 6 extraction-layer test files updated: field access replaces tuple unpacking - ExtractionOrchestrator.extract_from_pdf() unpacks ExtractionResult, re-packs as tuple for upstream callers (Rule 3 - blocking fix) - All mock return values 
in test_excluded_files_logging, test_processor, test_processor_refactored_methods, test_extraction_orchestrator updated to ExtractionResult - 1360 tests pass, 9 skipped (all expected) --- .../extraction/extraction_facade.py | 6 +- .../extraction/pdf_extractor.py | 22 ++++- .../services/extraction_orchestrator.py | 5 +- .../tests/extraction/test_page_skipping.py | 53 ++++++------ .../tests/extraction/test_pdf_extractor.py | 49 +++++------ .../services/test_extraction_orchestrator.py | 33 +++++--- .../templates/test_template_integration.py | 6 +- .../tests/test_credit_card_detection.py | 81 ++++++++++++------- .../tests/test_excluded_files_logging.py | 64 +++++++++++---- .../tests/test_pdf_table_extractor.py | 72 +++++++++-------- packages/parser-core/tests/test_processor.py | 22 ++++- .../test_processor_refactored_methods.py | 23 ++++-- .../test_transaction_data_no_blocking.py | 44 +++++----- 13 files changed, 302 insertions(+), 178 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py b/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py index 57e3d34..7fdc3d6 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py +++ b/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py @@ -12,6 +12,7 @@ from bankstatements_core.config.column_config import DEFAULT_COLUMNS from bankstatements_core.extraction.extraction_params import TABLE_BOTTOM_Y, TABLE_TOP_Y +from bankstatements_core.domain import ExtractionResult if TYPE_CHECKING: from bankstatements_core.extraction.row_classifiers import RowClassifier @@ -70,7 +71,7 @@ def extract_tables_from_pdf( enable_page_validation: bool | None = None, enable_header_check: bool | None = None, template: "BankTemplate" | None = None, -) -> tuple[list[dict], int, str | None]: +) -> ExtractionResult: """ Extract table data from PDF within specified bounds (facade function). 
@@ -87,7 +88,8 @@ def extract_tables_from_pdf( template: Optional BankTemplate to use for extraction configuration Returns: - Tuple of (extracted rows, number of pages, IBAN if found) + ExtractionResult containing extracted transactions, page count, IBAN, + source file path, and any document-level warnings """ from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor diff --git a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py index d0e2652..549e6c2 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py @@ -22,6 +22,8 @@ StatefulPageRowProcessor, extract_filename_date, ) +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions logger = logging.getLogger(__name__) @@ -72,14 +74,15 @@ def __init__( else: self._pdf_reader = pdf_reader - def extract(self, pdf_path: Path) -> tuple[list[dict], int, str | None]: + def extract(self, pdf_path: Path) -> ExtractionResult: """Extract table data from PDF file. Args: pdf_path: Path to the PDF file Returns: - Tuple of (extracted rows, total page count, IBAN if found) + ExtractionResult containing extracted transactions, page count, + IBAN if found, source file path, and any document-level warnings """ rows: list[dict] = [] iban = None @@ -104,7 +107,13 @@ def extract(self, pdf_path: Path) -> tuple[list[dict], int, str | None]: f"Credit card statement detected in {pdf_path.name}. " f"Credit card statements are not currently supported. Skipping file." 
) - return [], len(pdf.pages), None + return ExtractionResult( + transactions=[], + page_count=len(pdf.pages), + iban=None, + source_file=pdf_path, + warnings=["credit card statement detected, skipped"], + ) if iban is None and page_num == 1: iban = self._header_analyser.extract_iban(page) @@ -124,7 +133,12 @@ def extract(self, pdf_path: Path) -> tuple[list[dict], int, str | None]: rows.extend(page_processor.process_page(page_rows)) - return rows, len(pdf.pages), iban + return ExtractionResult( + transactions=dicts_to_transactions(rows), + page_count=len(pdf.pages), + iban=iban, + source_file=pdf_path, + ) def _extract_page(self, page: Any, page_num: int) -> list[dict] | None: """Extract rows from a single page. diff --git a/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py index 2e1efa2..4cd9b14 100644 --- a/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py @@ -158,7 +158,7 @@ def extract_from_pdf( logger.info(f"Using template: {template.name} for {pdf_path.name}") # Extract transactions using template - rows, page_count, iban = extract_tables_from_pdf( + extraction_result = extract_tables_from_pdf( pdf_path, self._config.table_top_y, self._config.table_bottom_y, @@ -166,6 +166,9 @@ def extract_from_pdf( self._config.enable_dynamic_boundary, template=template, ) + rows = [tx.to_dict() for tx in extraction_result.transactions] + page_count = extraction_result.page_count + iban = extraction_result.iban # Log IBAN if found if iban: diff --git a/packages/parser-core/tests/extraction/test_page_skipping.py b/packages/parser-core/tests/extraction/test_page_skipping.py index fd9ced3..fe224d8 100644 --- a/packages/parser-core/tests/extraction/test_page_skipping.py +++ b/packages/parser-core/tests/extraction/test_page_skipping.py @@ -13,6 +13,7 
@@ import pytest +from bankstatements_core.domain import ExtractionResult from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor # Test columns configuration @@ -94,13 +95,13 @@ def test_skip_page_without_headers_and_continue(self, mock_pdfplumber): extractor = PDFTableExtractor( columns=TEST_COLUMNS, enable_dynamic_boundary=True ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Assertions - assert page_count == 3 # All pages were examined - assert len(rows) == 2 # Only 2 transactions (from pages 2 and 3) - assert rows[0]["Filename"] == "test.pdf" - assert rows[1]["Filename"] == "test.pdf" + assert result.page_count == 3 # All pages were examined + assert len(result.transactions) == 2 # Only 2 transactions (from pages 2 and 3) + assert result.transactions[0].filename == "test.pdf" + assert result.transactions[1].filename == "test.pdf" @patch.dict("os.environ", {"REQUIRE_TABLE_HEADERS": "true"}) @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") @@ -137,10 +138,10 @@ def test_skip_all_pages_without_tables(self, mock_pdfplumber): extractor = PDFTableExtractor( columns=TEST_COLUMNS, enable_dynamic_boundary=True ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 2 # All pages were examined - assert len(rows) == 0 # No transactions extracted + assert result.page_count == 2 # All pages were examined + assert len(result.transactions) == 0 # No transactions extracted @patch.dict("os.environ", {"REQUIRE_TABLE_HEADERS": "true"}) @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") @@ -174,10 +175,10 @@ def test_process_all_pages_with_tables(self, mock_pdfplumber): extractor = PDFTableExtractor( columns=TEST_COLUMNS, enable_dynamic_boundary=True ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = 
extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 3 - assert len(rows) == 3 # One transaction from each page + assert result.page_count == 3 + assert len(result.transactions) == 3 # One transaction from each page @patch.dict("os.environ", {"REQUIRE_TABLE_HEADERS": "true"}) @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") @@ -234,10 +235,10 @@ def test_skip_middle_page_without_table(self, mock_pdfplumber): extractor = PDFTableExtractor( columns=TEST_COLUMNS, enable_dynamic_boundary=True ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 3 # All 3 pages examined - assert len(rows) == 2 # Only pages 1 and 3 had transactions + assert result.page_count == 3 # All 3 pages examined + assert len(result.transactions) == 2 # Only pages 1 and 3 had transactions @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_page_validation_disabled_processes_all_pages(self, mock_pdfplumber): @@ -262,11 +263,11 @@ def test_page_validation_disabled_processes_all_pages(self, mock_pdfplumber): extractor = PDFTableExtractor( columns=TEST_COLUMNS, enable_page_validation=False ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 + assert result.page_count == 1 # Should process page even without headers - assert len(rows) >= 0 # May or may not have valid transactions + assert len(result.transactions) >= 0 # May or may not have valid transactions @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_validation_enabled_by_default(self, mock_pdfplumber): @@ -298,10 +299,10 @@ def test_validation_enabled_by_default(self, mock_pdfplumber): assert extractor.header_check_enabled is True # Extract and verify page is skipped due to missing headers - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result 
= extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 # Page was examined - assert len(rows) == 0 # Page was skipped (no headers) + assert result.page_count == 1 # Page was examined + assert len(result.transactions) == 0 # Page was skipped (no headers) @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_empty_pdf_returns_zero_rows(self, mock_pdfplumber): @@ -311,10 +312,10 @@ def test_empty_pdf_returns_zero_rows(self, mock_pdfplumber): mock_pdfplumber.return_value = mock_pdf extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/empty.pdf")) + result = extractor.extract(Path("/tmp/empty.pdf")) - assert page_count == 0 - assert len(rows) == 0 + assert result.page_count == 0 + assert len(result.transactions) == 0 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_page_with_insufficient_rows_skipped(self, mock_pdfplumber): @@ -336,8 +337,8 @@ def test_page_with_insufficient_rows_skipped(self, mock_pdfplumber): mock_cropped.extract_words.return_value = mock_words extractor = PDFTableExtractor(columns=TEST_COLUMNS, enable_page_validation=True) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 + assert result.page_count == 1 # Should be skipped due to insufficient transaction rows - assert len(rows) == 0 + assert len(result.transactions) == 0 diff --git a/packages/parser-core/tests/extraction/test_pdf_extractor.py b/packages/parser-core/tests/extraction/test_pdf_extractor.py index dffd7c8..5fa3e2b 100644 --- a/packages/parser-core/tests/extraction/test_pdf_extractor.py +++ b/packages/parser-core/tests/extraction/test_pdf_extractor.py @@ -7,6 +7,8 @@ import pytest +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.extraction.pdf_extractor import 
PDFTableExtractor from bankstatements_core.extraction.row_post_processor import ( RowPostProcessor, @@ -189,11 +191,12 @@ def test_extract_basic_flow(self, mock_pdfplumber): enable_page_validation=False, enable_header_check=False, ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 - assert len(rows) == 1 - assert rows[0]["Filename"] == "test.pdf" + assert isinstance(result, ExtractionResult) + assert result.page_count == 1 + assert len(result.transactions) == 1 + assert result.transactions[0].filename == "test.pdf" @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_extract_with_dynamic_boundary(self, mock_pdfplumber): @@ -224,9 +227,9 @@ def test_extract_with_dynamic_boundary(self, mock_pdfplumber): enable_page_validation=False, enable_header_check=False, ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 + assert result.page_count == 1 # Crop is called for: credit card check (header), table extraction (initial + final) assert mock_page.crop.call_count >= 2 # At least initial + final for table @@ -249,11 +252,11 @@ def test_extract_with_page_validation_failure(self, mock_pdfplumber): mock_cropped.extract_words.return_value = mock_words extractor = PDFTableExtractor(columns=TEST_COLUMNS, enable_page_validation=True) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 + assert result.page_count == 1 # No valid transactions, so rows should be empty - assert len(rows) == 0 + assert len(result.transactions) == 0 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_extract_multiple_pages(self, mock_pdfplumber): @@ -289,10 +292,10 @@ def test_extract_multiple_pages(self, mock_pdfplumber): enable_page_validation=False, 
enable_header_check=False, ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 2 - assert len(rows) == 2 + assert result.page_count == 2 + assert len(result.transactions) == 2 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_extract_date_propagation_across_rows(self, mock_pdfplumber): @@ -329,12 +332,12 @@ def test_extract_date_propagation_across_rows(self, mock_pdfplumber): enable_page_validation=False, enable_header_check=False, ) - rows, _, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert len(rows) == 2 + assert len(result.transactions) == 2 # Both rows should have the same date (2023 is captured if x1 is provided for all words) - assert "01 Jan 2023" in rows[0]["Date"] - assert "01 Jan 2023" in rows[1]["Date"] + assert "01 Jan 2023" in result.transactions[0].date + assert "01 Jan 2023" in result.transactions[1].date def test_page_validation_constructor_parameter(self): """Test page validation setting via constructor parameter.""" @@ -431,10 +434,10 @@ def test_per_page_boundaries_applied_correctly(self, mock_pdfplumber): enable_header_check=False, ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 2 - assert len(rows) == 2 + assert result.page_count == 2 + assert len(result.transactions) == 2 # Verify crop was called with correct boundaries # Page 1 should use override (490) @@ -480,10 +483,10 @@ def test_per_page_boundaries_without_extraction_config(self, mock_pdfplumber): enable_header_check=False, ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert page_count == 1 - assert len(rows) == 1 + assert result.page_count == 1 + assert len(result.transactions) == 1 # Verify crop was called with instance defaults 
(300) crop_calls = [ @@ -535,7 +538,7 @@ def test_per_page_header_check_top_y_override(self, mock_pdfplumber): enable_header_check=True, # Enable header check ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Verify header area crop was called with override value (450) header_crop_calls = [ diff --git a/packages/parser-core/tests/services/test_extraction_orchestrator.py b/packages/parser-core/tests/services/test_extraction_orchestrator.py index 890a798..24aaf9e 100644 --- a/packages/parser-core/tests/services/test_extraction_orchestrator.py +++ b/packages/parser-core/tests/services/test_extraction_orchestrator.py @@ -8,6 +8,8 @@ from unittest.mock import MagicMock, patch from bankstatements_core.config.processor_config import ExtractionConfig +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.services.extraction_orchestrator import ExtractionOrchestrator @@ -34,10 +36,11 @@ def test_extract_from_pdf_success(self, mock_extract): pdf_path.write_text("fake pdf") # Mock successful extraction - mock_extract.return_value = ( - [{"Date": "01/01/23", "Details": "Test"}], - 5, - "IE12BOFI90000112345", + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([{"Date": "01/01/23", "Details": "Test"}]), + page_count=5, + iban="IE12BOFI90000112345", + source_file=pdf_path, ) rows, pages, iban = self.orchestrator.extract_from_pdf(pdf_path) @@ -59,7 +62,12 @@ def test_extract_from_pdf_with_forced_template(self, mock_extract): mock_template = MagicMock() mock_template.name = "TestTemplate" - mock_extract.return_value = ([{"Date": "01/01/23"}], 3, None) + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([{"Date": "01/01/23"}]), + page_count=3, + iban=None, + source_file=pdf_path, + ) rows, pages, iban = self.orchestrator.extract_from_pdf( 
pdf_path, forced_template=mock_template @@ -79,7 +87,9 @@ def test_extract_from_pdf_no_data(self, mock_extract): pdf_path = Path(self.temp_dir) / "test.pdf" pdf_path.write_text("fake pdf") - mock_extract.return_value = ([], 2, None) + mock_extract.return_value = ExtractionResult( + transactions=[], page_count=2, iban=None, source_file=pdf_path + ) rows, pages, iban = self.orchestrator.extract_from_pdf(pdf_path) @@ -95,13 +105,14 @@ def test_extract_from_pdf_returns_rows_as_is(self, mock_extract): pdf_path = Path(self.temp_dir) / "test.pdf" pdf_path.write_text("fake pdf") - mock_extract.return_value = ( - [ + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([ {"Date": "01/01/23", "Details": "Test1"}, {"Date": "02/01/23", "Details": "Test2"}, - ], - 3, - None, + ]), + page_count=3, + iban=None, + source_file=pdf_path, ) rows, _, _ = self.orchestrator.extract_from_pdf(pdf_path) diff --git a/packages/parser-core/tests/templates/test_template_integration.py b/packages/parser-core/tests/templates/test_template_integration.py index fd25a7d..63b01ec 100644 --- a/packages/parser-core/tests/templates/test_template_integration.py +++ b/packages/parser-core/tests/templates/test_template_integration.py @@ -239,7 +239,7 @@ def test_revolut_pdf_extracts_all_transactions(self): "input/account-statement_2025-01-01_2026-01-30_en-ie_d04719.pdf" ) - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( pdf_path, table_top_y=revolut_template.extraction.table_top_y, table_bottom_y=revolut_template.extraction.table_bottom_y, @@ -250,9 +250,9 @@ def test_revolut_pdf_extracts_all_transactions(self): # Should extract 67+ transactions from pages 2-4 # (Page 1 and 5 have different structure and are skipped) - assert len(rows) >= 67 + assert len(result.transactions) >= 67 # Verify we have January 2025 transactions (not missing first 24) - dates = [row.get("Date", "") for row in rows if row.get("Date")] + dates = [tx.date for tx 
in result.transactions if tx.date] jan_transactions = [d for d in dates if "Jan 2025" in d] assert len(jan_transactions) >= 4 # At least 4 January transactions diff --git a/packages/parser-core/tests/test_credit_card_detection.py b/packages/parser-core/tests/test_credit_card_detection.py index d4d7160..85935a6 100644 --- a/packages/parser-core/tests/test_credit_card_detection.py +++ b/packages/parser-core/tests/test_credit_card_detection.py @@ -7,6 +7,7 @@ import pytest +from bankstatements_core.domain import ExtractionResult from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor # Test columns configuration @@ -40,12 +41,13 @@ def test_detect_credit_card_statement_with_card_number(self, mock_pdfplumber): """ extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/credit_card.pdf")) + result = extractor.extract(Path("/tmp/credit_card.pdf")) # Should return empty rows (skipped) - assert len(rows) == 0 - assert page_count == 1 # Still counts the pages - assert iban is None + assert isinstance(result, ExtractionResult) + assert len(result.transactions) == 0 + assert result.page_count == 1 # Still counts the pages + assert result.iban is None @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_detect_credit_card_statement_case_insensitive(self, mock_pdfplumber): @@ -67,9 +69,9 @@ def test_detect_credit_card_statement_case_insensitive(self, mock_pdfplumber): mock_page.extract_text.return_value = f"Statement\n{text}\nDetails" extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert len(rows) == 0, f"Failed to detect: {text}" + assert len(result.transactions) == 0, f"Failed to detect: {text}" @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_detect_credit_card_statement_with_credit_limit(self, mock_pdfplumber): @@ 
-87,11 +89,11 @@ def test_detect_credit_card_statement_with_credit_limit(self, mock_pdfplumber): """ extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/credit_card.pdf")) + result = extractor.extract(Path("/tmp/credit_card.pdf")) - assert len(rows) == 0 - assert page_count == 1 - assert iban is None + assert len(result.transactions) == 0 + assert result.page_count == 1 + assert result.iban is None @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_detect_credit_card_statement_with_credit_card_text(self, mock_pdfplumber): @@ -107,10 +109,10 @@ def test_detect_credit_card_statement_with_credit_card_text(self, mock_pdfplumbe """ extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/credit_card.pdf")) + result = extractor.extract(Path("/tmp/credit_card.pdf")) - assert len(rows) == 0 - assert page_count == 1 + assert len(result.transactions) == 0 + assert result.page_count == 1 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_detect_credit_card_statement_with_visa(self, mock_pdfplumber): @@ -126,10 +128,10 @@ def test_detect_credit_card_statement_with_visa(self, mock_pdfplumber): """ extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/visa.pdf")) + result = extractor.extract(Path("/tmp/visa.pdf")) - assert len(rows) == 0 - assert page_count == 1 + assert len(result.transactions) == 0 + assert result.page_count == 1 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_detect_credit_card_statement_with_mastercard(self, mock_pdfplumber): @@ -145,10 +147,10 @@ def test_detect_credit_card_statement_with_mastercard(self, mock_pdfplumber): """ extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/mastercard.pdf")) + result = 
extractor.extract(Path("/tmp/mastercard.pdf")) - assert len(rows) == 0 - assert page_count == 1 + assert len(result.transactions) == 0 + assert result.page_count == 1 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_all_credit_card_patterns(self, mock_pdfplumber): @@ -171,9 +173,9 @@ def test_all_credit_card_patterns(self, mock_pdfplumber): mock_page.extract_text.return_value = f"Header\n{text}\nFooter" extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) - assert len(rows) == 0, f"Failed to detect with pattern: {pattern_name}" + assert len(result.transactions) == 0, f"Failed to detect with pattern: {pattern_name}" @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_does_not_detect_false_positives(self, mock_pdfplumber): @@ -206,10 +208,10 @@ def test_does_not_detect_false_positives(self, mock_pdfplumber): mock_cropped.extract_words.return_value = mock_words extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/bank.pdf")) + result = extractor.extract(Path("/tmp/bank.pdf")) # Should process normally - assert page_count == 1 + assert result.page_count == 1 # May or may not have rows depending on validation, but shouldn't be empty due to CC detection # The key is it didn't skip due to credit card detection @@ -231,10 +233,10 @@ def test_credit_card_detection_with_extraction_error(self, mock_pdfplumber): extractor = PDFTableExtractor(columns=TEST_COLUMNS) # Should not crash, should continue processing - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Should complete without raising exception - assert page_count == 1 + assert result.page_count == 1 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def 
test_credit_card_detection_only_checks_first_page(self, mock_pdfplumber): @@ -260,10 +262,10 @@ def test_credit_card_detection_only_checks_first_page(self, mock_pdfplumber): mock_cropped2.extract_words.return_value = [] extractor = PDFTableExtractor(columns=TEST_COLUMNS) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Should process both pages (not skipped) - assert page_count == 2 + assert result.page_count == 2 def test_is_credit_card_statement_method_directly(self): """Test the _is_credit_card_statement method directly.""" @@ -331,3 +333,28 @@ def test_visa_in_transaction_not_detected(self): ) is False ) + + @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") + def test_credit_card_early_return_produces_extraction_result_with_warning( + self, mock_pdfplumber + ): + """Test that credit card detection returns ExtractionResult with warning message.""" + mock_pdf = MagicMock() + mock_page = MagicMock() + mock_page.width = 612 + mock_pdf.pages = [mock_page] + mock_pdfplumber.return_value = mock_pdf + + # Mock the cropped header area to return credit card text + mock_header = MagicMock() + mock_header.extract_text.return_value = "Statement for Card Number 1234" + mock_page.crop.return_value = mock_header + + extractor = PDFTableExtractor(columns=TEST_COLUMNS) + result = extractor.extract(Path("/tmp/credit_card.pdf")) + + assert isinstance(result, ExtractionResult) + assert len(result.transactions) == 0 + assert result.iban is None + assert len(result.warnings) > 0 + assert "credit card" in result.warnings[0].lower() diff --git a/packages/parser-core/tests/test_excluded_files_logging.py b/packages/parser-core/tests/test_excluded_files_logging.py index 8ab1d55..38492df 100644 --- a/packages/parser-core/tests/test_excluded_files_logging.py +++ b/packages/parser-core/tests/test_excluded_files_logging.py @@ -10,6 +10,8 @@ import pytest from 
bankstatements_core.config.processor_config import ProcessorConfig +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor @@ -40,7 +42,11 @@ def test_excluded_files_json_created(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ([], 1, None) # Empty rows, 1 page, no IBAN + mock_extract.return_value = ExtractionResult( + transactions=[], page_count=1, iban=None, + source_file=Path("credit_card.pdf"), + warnings=["credit card statement detected, skipped"], + ) processor = create_test_processor(input_dir, output_dir) processor.run() @@ -77,10 +83,13 @@ def test_excluded_files_json_not_created_when_no_exclusions(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ( - [{"Date": "01/12/23", "Details": "Test", "Debit €": "100"}], - 1, - "IE29AIBK93115212345678", + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([ + {"Date": "01/12/23", "Details": "Test", "Debit €": "100"} + ]), + page_count=1, + iban="IE29AIBK93115212345678", + source_file=Path("bank_statement.pdf"), ) processor = create_test_processor(input_dir, output_dir) @@ -107,7 +116,11 @@ def test_excluded_files_multiple_exclusions(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ([], 2, None) + mock_extract.return_value = ExtractionResult( + transactions=[], page_count=2, iban=None, + source_file=Path("credit_card.pdf"), + warnings=["credit card statement detected, skipped"], + ) processor = create_test_processor(input_dir, output_dir) processor.run() @@ -146,12 +159,19 @@ def test_excluded_files_mixed_scenarios(self): # Mock PDFTableExtractor to 
return different results def mock_extract_side_effect(pdf_path, *args, **kwargs): if "credit_card" in str(pdf_path): - return ([], 1, None) # Excluded + return ExtractionResult( + transactions=[], page_count=1, iban=None, + source_file=pdf_path, + warnings=["credit card statement detected, skipped"], + ) else: - return ( - [{"Date": "01/12/23", "Details": "Test", "Debit €": "100"}], - 1, - "IE29AIBK93115212345678", + return ExtractionResult( + transactions=dicts_to_transactions([ + {"Date": "01/12/23", "Details": "Test", "Debit €": "100"} + ]), + page_count=1, + iban="IE29AIBK93115212345678", + source_file=pdf_path, ) with patch( @@ -187,7 +207,11 @@ def test_excluded_files_json_structure(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ([], 3, None) # 3 pages + mock_extract.return_value = ExtractionResult( + transactions=[], page_count=3, iban=None, + source_file=Path("test.pdf"), + warnings=["credit card statement detected, skipped"], + ) processor = create_test_processor(input_dir, output_dir) processor.run() @@ -302,7 +326,10 @@ def test_pdf_with_iban_not_excluded(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ([], 2, "IE29AIBK93115212345678") + mock_extract.return_value = ExtractionResult( + transactions=[], page_count=2, iban="IE29AIBK93115212345678", + source_file=Path("bank_statement.pdf"), + ) processor = create_test_processor(input_dir, output_dir) processor.run() @@ -333,10 +360,17 @@ def test_mixed_with_iban_and_without_iban(self): def mock_extract_side_effect(pdf_path, *args, **kwargs): if "bank_with_iban" in str(pdf_path): # Has IBAN, empty rows (should NOT be excluded) - return ([], 2, "IE29AIBK93115212345678") + return ExtractionResult( + transactions=[], page_count=2, iban="IE29AIBK93115212345678", + source_file=pdf_path, + ) else: # No IBAN, empty rows 
(SHOULD be excluded) - return ([], 1, None) + return ExtractionResult( + transactions=[], page_count=1, iban=None, + source_file=pdf_path, + warnings=["credit card statement detected, skipped"], + ) with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" diff --git a/packages/parser-core/tests/test_pdf_table_extractor.py b/packages/parser-core/tests/test_pdf_table_extractor.py index a9f8414..d515393 100644 --- a/packages/parser-core/tests/test_pdf_table_extractor.py +++ b/packages/parser-core/tests/test_pdf_table_extractor.py @@ -7,6 +7,8 @@ from unittest.mock import MagicMock, Mock, patch from bankstatements_core.extraction.column_identifier import ColumnTypeIdentifier +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.pdf_table_extractor import ( DEFAULT_COLUMNS, analyze_content_density, @@ -67,29 +69,29 @@ def test_extract_tables_with_date_propagation(self, mock_pdfplumber): # Test the extraction test_pdf_path = Path("/tmp/test.pdf") - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_page_validation=False, enable_header_check=False ) # Verify results - self.assertEqual(page_count, 1) - self.assertEqual(len(rows), 3) + self.assertEqual(result.page_count, 1) + self.assertEqual(len(result.transactions), 3) # Check first transaction (has original date) - self.assertEqual(rows[0]["Date"].strip(), "01 Jan 2024") - self.assertEqual(rows[0]["Details"].strip(), "Salary Payment") - self.assertEqual(rows[0]["Credit €"].strip(), "3000.00") - self.assertEqual(rows[0]["Filename"], "test.pdf") + self.assertEqual(result.transactions[0].date.strip(), "01 Jan 2024") + self.assertEqual(result.transactions[0].details.strip(), "Salary Payment") + self.assertEqual(result.transactions[0].credit.strip(), "3000.00") + self.assertEqual(result.transactions[0].filename, "test.pdf") # Check 
second transaction (should inherit date from first) - self.assertEqual(rows[1]["Date"].strip(), "01 Jan 2024") # Inherited! - self.assertEqual(rows[1]["Details"].strip(), "Coffee Shop") - self.assertEqual(rows[1]["Debit €"].strip(), "5.50") + self.assertEqual(result.transactions[1].date.strip(), "01 Jan 2024") # Inherited! + self.assertEqual(result.transactions[1].details.strip(), "Coffee Shop") + self.assertEqual(result.transactions[1].debit.strip(), "5.50") # Check third transaction (has new date) - self.assertEqual(rows[2]["Date"].strip(), "02 Jan 2024") - self.assertEqual(rows[2]["Details"].strip(), "Gas Station") - self.assertEqual(rows[2]["Debit €"].strip(), "40.00") + self.assertEqual(result.transactions[2].date.strip(), "02 Jan 2024") + self.assertEqual(result.transactions[2].details.strip(), "Gas Station") + self.assertEqual(result.transactions[2].debit.strip(), "40.00") @patch("bankstatements_core.pdf_table_extractor.pdfplumber.open") def test_extract_tables_filename_tagging(self, mock_pdfplumber): @@ -117,13 +119,13 @@ def test_extract_tables_filename_tagging(self, mock_pdfplumber): # Test with specific filename - pass parameters directly instead of using env vars test_pdf_path = Path("/tmp/bank_statement_jan2024.pdf") - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_page_validation=False, enable_header_check=False ) # Verify filename tagging - self.assertEqual(len(rows), 1) - self.assertEqual(rows[0]["Filename"], "bank_statement_jan2024.pdf") + self.assertEqual(len(result.transactions), 1) + self.assertEqual(result.transactions[0].filename, "bank_statement_jan2024.pdf") @patch("bankstatements_core.pdf_table_extractor.pdfplumber.open") def test_extract_tables_empty_rows_filtered(self, mock_pdfplumber): @@ -153,13 +155,13 @@ def test_extract_tables_empty_rows_filtered(self, mock_pdfplumber): mock_cropped.extract_words.return_value = mock_words test_pdf_path = Path("/tmp/test.pdf") - rows, 
page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_page_validation=False, enable_header_check=False ) # Should only have 2 valid rows (empty row filtered out) - self.assertEqual(len(rows), 2) - self.assertTrue(all(any(row.values()) for row in rows)) + self.assertEqual(len(result.transactions), 2) + self.assertTrue(all(any(t.to_dict().values()) for t in result.transactions)) def test_classify_row_type(self): """Test row classification for transaction vs non-transaction content""" @@ -405,7 +407,7 @@ def crop_side_effect(*args): test_pdf_path = Path("/tmp/test.pdf") # Test with dynamic boundary enabled - dynamic_rows, _, iban = extract_tables_from_pdf( + dynamic_result = extract_tables_from_pdf( test_pdf_path, enable_dynamic_boundary=True, enable_page_validation=False, @@ -413,8 +415,8 @@ def crop_side_effect(*args): ) # Should have fewer rows (administrative content filtered) - self.assertEqual(len(dynamic_rows), 1) - self.assertEqual(dynamic_rows[0]["Details"].strip(), "VDC-STORE") + self.assertEqual(len(dynamic_result.transactions), 1) + self.assertEqual(dynamic_result.transactions[0].details.strip(), "VDC-STORE") run_test() @@ -636,26 +638,26 @@ def test_extract_tables_with_page_validation_enabled(self, mock_pdfplumber): test_pdf_path = Path("/tmp/test.pdf") # Test with page validation enabled - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_page_validation=True ) # Should have processed only the valid page - self.assertEqual(page_count, 2) # Total pages processed - self.assertEqual(len(rows), 3) # Only rows from valid page + self.assertEqual(result.page_count, 2) # Total pages processed + self.assertEqual(len(result.transactions), 3) # Only rows from valid page # Verify all returned rows are from valid transactions - for row in rows: - self.assertTrue(any(row.values())) # Non-empty rows - self.assertEqual(row["Filename"], "test.pdf") + for tx in 
result.transactions: + self.assertTrue(any(tx.to_dict().values())) # Non-empty rows + self.assertEqual(tx.filename, "test.pdf") # Test with page validation disabled - rows_no_validation, _, iban = extract_tables_from_pdf( + result_no_val = extract_tables_from_pdf( test_pdf_path, enable_page_validation=False ) # Should process both pages (more rows due to invalid page content) - self.assertGreaterEqual(len(rows_no_validation), len(rows)) + self.assertGreaterEqual(len(result_no_val.transactions), len(result.transactions)) def test_analyze_content_density_empty_word_groups(self): """Test analyze_content_density with empty word groups.""" @@ -930,13 +932,13 @@ def test_extract_tables_with_header_check_enabled_no_headers(self, mock_pdfplumb test_pdf_path = Path("/tmp/test.pdf") # Should skip the page due to no headers - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_dynamic_boundary=True ) # Should return empty results since page was skipped - self.assertEqual(len(rows), 0) - self.assertEqual(page_count, 1) # Still counts the page + self.assertEqual(len(result.transactions), 0) + self.assertEqual(result.page_count, 1) # Still counts the page finally: # Restore environment @@ -968,12 +970,12 @@ def test_extract_tables_static_mode_header_check(self, mock_pdfplumber): test_pdf_path = Path("/tmp/test.pdf") # Should skip page in static mode too - rows, page_count, iban = extract_tables_from_pdf( + result = extract_tables_from_pdf( test_pdf_path, enable_dynamic_boundary=False # Static mode ) # Should return empty results - self.assertEqual(len(rows), 0) + self.assertEqual(len(result.transactions), 0) finally: if old_header_check: diff --git a/packages/parser-core/tests/test_processor.py b/packages/parser-core/tests/test_processor.py index 4d193e8..6c22ec2 100644 --- a/packages/parser-core/tests/test_processor.py +++ b/packages/parser-core/tests/test_processor.py @@ -16,6 +16,8 @@ ProcessingConfig, ProcessorConfig, ) 
+from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.processor import ( BankStatementProcessor, calculate_column_totals, @@ -274,9 +276,15 @@ def test_processor_continues_after_pdf_error(self, mock_extract): # Mock: first PDF succeeds, second fails, third succeeds mock_extract.side_effect = [ - ([{"Date": "01 Jan 2024", "Details": "Transaction 1"}], 1, None), + ExtractionResult( + transactions=dicts_to_transactions([{"Date": "01 Jan 2024", "Details": "Transaction 1"}]), + page_count=1, iban=None, source_file=Path("good.pdf"), + ), OSError("Failed to open PDF"), - ([{"Date": "02 Jan 2024", "Details": "Transaction 2"}], 1, None), + ExtractionResult( + transactions=dicts_to_transactions([{"Date": "02 Jan 2024", "Details": "Transaction 2"}]), + page_count=1, iban=None, source_file=Path("good2.pdf"), + ), ] processor = create_test_processor(self.input_dir, self.output_dir) @@ -298,8 +306,14 @@ def test_processor_handles_empty_pdf_gracefully(self, mock_extract): # Mock: first PDF returns no rows, second has data mock_extract.side_effect = [ - ([], 1, None), # Empty PDF with 1 page - ([{"Date": "01 Jan 2024", "Details": "Transaction 1"}], 1, None), + ExtractionResult( + transactions=[], page_count=1, iban=None, source_file=Path("empty.pdf"), + warnings=["credit card statement detected, skipped"], + ), + ExtractionResult( + transactions=dicts_to_transactions([{"Date": "01 Jan 2024", "Details": "Transaction 1"}]), + page_count=1, iban=None, source_file=Path("with_data.pdf"), + ), ] processor = create_test_processor(self.input_dir, self.output_dir) diff --git a/packages/parser-core/tests/test_processor_refactored_methods.py b/packages/parser-core/tests/test_processor_refactored_methods.py index 2c108aa..8e89eca 100644 --- a/packages/parser-core/tests/test_processor_refactored_methods.py +++ b/packages/parser-core/tests/test_processor_refactored_methods.py @@ -16,6 +16,8 @@ 
ExtractionConfig, ProcessorConfig, ) +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.processor import BankStatementProcessor # Module-level defaults to avoid B008 (function call in defaults) @@ -75,8 +77,14 @@ def test_process_all_pdfs_multiple_files(self, mock_extract): # Mock extract_tables_from_pdf to return different data mock_extract.side_effect = [ - ([{"Date": "01/01/23", "Details": "Test1"}], 5, None), - ([{"Date": "02/01/23", "Details": "Test2"}], 3, None), + ExtractionResult( + transactions=dicts_to_transactions([{"Date": "01/01/23", "Details": "Test1"}]), + page_count=5, iban=None, source_file=Path("file1.pdf"), + ), + ExtractionResult( + transactions=dicts_to_transactions([{"Date": "02/01/23", "Details": "Test2"}]), + page_count=3, iban=None, source_file=Path("file2.pdf"), + ), ] all_rows, pages_read, pdf_ibans = processor._process_all_pdfs() @@ -323,13 +331,14 @@ def test_run_uses_refactored_methods(self, mock_extract): ) # Mock extract_tables_from_pdf - mock_extract.return_value = ( - [ + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([ {"Date": "15/06/23", "Details": "Second", "Debit €": "200"}, {"Date": "01/01/23", "Details": "First", "Debit €": "100"}, - ], - 2, - None, + ]), + page_count=2, + iban=None, + source_file=Path("test.pdf"), ) result = processor.run() diff --git a/packages/parser-core/tests/test_transaction_data_no_blocking.py b/packages/parser-core/tests/test_transaction_data_no_blocking.py index a53e10f..ae2596f 100644 --- a/packages/parser-core/tests/test_transaction_data_no_blocking.py +++ b/packages/parser-core/tests/test_transaction_data_no_blocking.py @@ -9,6 +9,8 @@ import pytest from bankstatements_core.config.processor_config import ProcessorConfig +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions def 
create_test_processor(input_dir, output_dir): @@ -77,11 +79,11 @@ def crop_side_effect(bbox): } ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Should NOT be detected as credit card (VISA only in transaction) - assert iban == "IE48AIBK93408921459015" - assert page_count == 1 + assert result.iban == "IE48AIBK93408921459015" + assert result.page_count == 1 # PDF should be processed, not blocked @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") @@ -127,11 +129,11 @@ def crop_side_effect(bbox): } ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Should NOT be detected as credit card - assert iban == "IE29AIBK93115212345678" - assert page_count == 1 + assert result.iban == "IE29AIBK93115212345678" + assert result.page_count == 1 @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_iban_in_transaction_does_not_override_header_iban(self, mock_pdfplumber): @@ -176,11 +178,11 @@ def crop_side_effect(bbox): } ) - rows, page_count, iban = extractor.extract(Path("/tmp/test.pdf")) + result = extractor.extract(Path("/tmp/test.pdf")) # Should return header IBAN, not transaction IBAN - assert iban == "IE48AIBK93408921459015" # Normalized without spaces - assert page_count == 1 + assert result.iban == "IE48AIBK93408921459015" # Normalized without spaces + assert result.page_count == 1 def test_processor_does_not_use_transaction_data_for_exclusion(self): """Test that processor only uses header data for exclusion decisions.""" @@ -197,17 +199,18 @@ def test_processor_does_not_use_transaction_data_for_exclusion(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - # Returns: rows with VISA in them, page count, IBAN from header - mock_extract.return_value = ( - [ + # Returns: ExtractionResult with VISA in transactions, 
IBAN from header + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([ { "Date": "11 Aug", "Details": "MOBI CLICK VISA", "Debit €": "100.00", } - ], - 1, - "IE48AIBK93408921459015", # IBAN from header + ]), + page_count=1, + iban="IE48AIBK93408921459015", + source_file=Path("test.pdf"), ) processor = create_test_processor(input_dir, output_dir) @@ -242,13 +245,14 @@ def test_all_card_brands_in_transactions_allowed(self): with patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: - mock_extract.return_value = ( - [ + mock_extract.return_value = ExtractionResult( + transactions=dicts_to_transactions([ {"Date": "11 Aug", "Details": detail} for detail in transaction_details - ], - 1, - "IE48AIBK93408921459015", # Has IBAN + ]), + page_count=1, + iban="IE48AIBK93408921459015", + source_file=Path("test.pdf"), ) processor = create_test_processor(input_dir, output_dir) From d925f029d3d270b5337b826958bd96ee1b448a57 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 07:58:09 +0000 Subject: [PATCH 03/14] test(23-02): add failing tests for ExtractionResult return from extract_from_pdf() - Update 4 tuple-unpack assertion sites to use ExtractionResult field access - Add assertIsInstance(result, ExtractionResult) to key assertions - Tests now expect extract_from_pdf() to return ExtractionResult (not tuple) --- .../services/test_extraction_orchestrator.py | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/packages/parser-core/tests/services/test_extraction_orchestrator.py b/packages/parser-core/tests/services/test_extraction_orchestrator.py index 24aaf9e..49d4c84 100644 --- a/packages/parser-core/tests/services/test_extraction_orchestrator.py +++ b/packages/parser-core/tests/services/test_extraction_orchestrator.py @@ -43,11 +43,12 @@ def test_extract_from_pdf_success(self, mock_extract): source_file=pdf_path, ) - rows, pages, iban = 
self.orchestrator.extract_from_pdf(pdf_path) + result = self.orchestrator.extract_from_pdf(pdf_path) - self.assertEqual(len(rows), 1) - self.assertEqual(pages, 5) - self.assertEqual(iban, "IE12BOFI90000112345") + self.assertIsInstance(result, ExtractionResult) + self.assertEqual(len(result.transactions), 1) + self.assertEqual(result.page_count, 5) + self.assertEqual(result.iban, "IE12BOFI90000112345") mock_extract.assert_called_once() @patch( @@ -69,12 +70,13 @@ def test_extract_from_pdf_with_forced_template(self, mock_extract): source_file=pdf_path, ) - rows, pages, iban = self.orchestrator.extract_from_pdf( + result = self.orchestrator.extract_from_pdf( pdf_path, forced_template=mock_template ) - self.assertEqual(len(rows), 1) - self.assertEqual(pages, 3) + self.assertIsInstance(result, ExtractionResult) + self.assertEqual(len(result.transactions), 1) + self.assertEqual(result.page_count, 3) # Verify extraction was called with the forced template call_args = mock_extract.call_args self.assertEqual(call_args[0][0], pdf_path) # First positional arg @@ -91,11 +93,12 @@ def test_extract_from_pdf_no_data(self, mock_extract): transactions=[], page_count=2, iban=None, source_file=pdf_path ) - rows, pages, iban = self.orchestrator.extract_from_pdf(pdf_path) + result = self.orchestrator.extract_from_pdf(pdf_path) - self.assertEqual(len(rows), 0) - self.assertEqual(pages, 2) - self.assertIsNone(iban) + self.assertIsInstance(result, ExtractionResult) + self.assertEqual(len(result.transactions), 0) + self.assertEqual(result.page_count, 2) + self.assertIsNone(result.iban) @patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" @@ -115,12 +118,13 @@ def test_extract_from_pdf_returns_rows_as_is(self, mock_extract): source_file=pdf_path, ) - rows, _, _ = self.orchestrator.extract_from_pdf(pdf_path) + result = self.orchestrator.extract_from_pdf(pdf_path) - # Rows should be returned as-is (Filename is added by caller, not orchestrator) - 
self.assertEqual(len(rows), 2) - self.assertEqual(rows[0]["Details"], "Test1") - self.assertEqual(rows[1]["Details"], "Test2") + self.assertIsInstance(result, ExtractionResult) + # transactions are Transaction objects; check via to_dict() for field values + self.assertEqual(len(result.transactions), 2) + self.assertEqual(result.transactions[0].to_dict()["Details"], "Test1") + self.assertEqual(result.transactions[1].to_dict()["Details"], "Test2") def test_initialization_without_errors(self): """Test orchestrator initializes without errors.""" From c9a7e92c608a878c8d0d4f6c81c0dcb8928ebd46 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 07:58:36 +0000 Subject: [PATCH 04/14] feat(23-02): wire extract_from_pdf() to return ExtractionResult directly - Add ExtractionResult import from bankstatements_core.domain - Change return annotation from tuple[list, int, str | None] to ExtractionResult - Remove tuple unpacking; pass ExtractionResult through unchanged - Use result.iban for IBAN logging - Update docstring Returns section --- .../services/extraction_orchestrator.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py index 4cd9b14..ba11d09 100644 --- a/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/extraction_orchestrator.py @@ -12,6 +12,7 @@ from bankstatements_core.domain.protocols.services import ITemplateDetector from bankstatements_core.config.processor_config import ExtractionConfig +from bankstatements_core.domain import ExtractionResult from bankstatements_core.entitlements import Entitlements from bankstatements_core.extraction.extraction_facade import extract_tables_from_pdf from bankstatements_core.templates import TemplateDetector, TemplateRegistry @@ 
-132,7 +133,7 @@ def extract_from_pdf( self, pdf_path: Path, forced_template: BankTemplate | None = None, - ) -> tuple[list, int, str | None]: + ) -> ExtractionResult: """Extract transactions from a single PDF file. Args: @@ -141,10 +142,8 @@ def extract_from_pdf( (overrides instance forced_template) Returns: - Tuple of (rows, page_count, iban): - - rows: List of transaction dictionaries - - page_count: Number of pages processed - - iban: Extracted IBAN or None + ExtractionResult containing extracted transactions, page count, IBAN, + and document-level warnings """ # Determine which template to use template = forced_template or self._forced_template @@ -158,7 +157,7 @@ def extract_from_pdf( logger.info(f"Using template: {template.name} for {pdf_path.name}") # Extract transactions using template - extraction_result = extract_tables_from_pdf( + result = extract_tables_from_pdf( pdf_path, self._config.table_top_y, self._config.table_bottom_y, @@ -166,17 +165,14 @@ def extract_from_pdf( self._config.enable_dynamic_boundary, template=template, ) - rows = [tx.to_dict() for tx in extraction_result.transactions] - page_count = extraction_result.page_count - iban = extraction_result.iban # Log IBAN if found - if iban: + if result.iban: logger.info( - f"IBAN extracted from {pdf_path.name}: {iban[:4]}****{iban[-4:]}" + f"IBAN extracted from {pdf_path.name}: {result.iban[:4]}****{result.iban[-4:]}" ) - return rows, page_count, iban + return result def _detect_template(self, pdf_path: Path) -> BankTemplate | None: """Detect template for a PDF file. 
From 3bfa787d20911bc347ba67694c81f1879e60f851 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 08:01:59 +0000 Subject: [PATCH 05/14] feat(23-02): wire process_all_pdfs() to return list[ExtractionResult] - Add ExtractionResult and transactions_to_dicts imports - Change return annotation from tuple[list[dict], int, dict[str,str]] to list[ExtractionResult] - Replace tuple unpacking with result.page_count/iban/transactions field access - Exclusion logic preserved using ExtractionResult fields - filter_service.apply_all_filters() receives transactions_to_dicts(result.transactions) - Append excluded results to list (page_count preserved for callers) - Update processor._process_all_pdfs() to adapt list[ExtractionResult] -> 3-tuple - Update test_processor test mock to use list[ExtractionResult] return value --- .../src/bankstatements_core/processor.py | 19 +++++++++- .../services/pdf_processing_orchestrator.py | 35 ++++++++++--------- packages/parser-core/tests/test_processor.py | 18 ++++++++-- 3 files changed, 51 insertions(+), 21 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/processor.py b/packages/parser-core/src/bankstatements_core/processor.py index 2480ebf..381d8c6 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -282,11 +282,28 @@ def _process_all_pdfs(self) -> tuple[list[dict], int, dict[str, str]]: Returns: Tuple of (all_rows, total_pages_read, pdf_ibans) """ + from bankstatements_core.domain.converters import transactions_to_dicts + # Delegate to PDF processing orchestrator with recursive_scan setting - return self._pdf_orchestrator.process_all_pdfs( + results = self._pdf_orchestrator.process_all_pdfs( self.input_dir, recursive=self.recursive_scan ) + # Adapt list[ExtractionResult] → (all_rows, pages_read, pdf_ibans) for caller + all_rows: list[dict] = [] + pages_read = 0 + pdf_ibans: dict[str, str] = {} + for result in results: + 
pages_read += result.page_count + # Skip excluded results (no IBAN, no transactions, but had pages) + if result.iban is None and len(result.transactions) == 0 and result.page_count > 0: + continue + if result.iban: + pdf_ibans[result.source_file.name] = result.iban + all_rows.extend(transactions_to_dicts(result.transactions)) + + return all_rows, pages_read, pdf_ibans + def _sort_transactions_by_date(self, rows: list[dict]) -> list[dict]: """ Sort transactions using the configured sorting strategy. diff --git a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py index e15f700..79cc0a0 100644 --- a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py @@ -16,6 +16,8 @@ from typing import TYPE_CHECKING, Any from bankstatements_core.config.processor_config import ExtractionConfig +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import transactions_to_dicts from bankstatements_core.entitlements import Entitlements if TYPE_CHECKING: @@ -103,7 +105,7 @@ def __init__( def process_all_pdfs( self, input_dir: Path, recursive: bool = False - ) -> tuple[list[dict], int, dict[str, str]]: + ) -> list[ExtractionResult]: """Process all PDF files in the input directory. 
Args: @@ -111,18 +113,16 @@ def process_all_pdfs( recursive: Whether to search subdirectories recursively Returns: - Tuple of (all_transactions, total_pages_read, pdf_ibans) + list[ExtractionResult] — one ExtractionResult per successfully processed PDF Examples: # orchestrator = PDFProcessingOrchestrator(config, columns, output_dir) - # result = orchestrator.process_all_pdfs(Path("input")) - # transactions, pages, ibans = result - # Processed {len(transactions)} transactions + # results = orchestrator.process_all_pdfs(Path("input")) """ # Discover PDF files pdf_files = self.pdf_discovery.discover_pdfs(input_dir, recursive=recursive) - all_rows: list[dict] = [] + results: list[ExtractionResult] = [] pages_read = 0 pdf_ibans: dict[str, str] = {} excluded_files: list[dict[str, Any]] = [] @@ -132,13 +132,11 @@ def process_all_pdfs( logger.info("Processing PDF %d of %d", idx, len(pdf_files)) try: - rows, page_count, iban = self.extraction_orchestrator.extract_from_pdf( - pdf - ) - pages_read += page_count + result = self.extraction_orchestrator.extract_from_pdf(pdf) + pages_read += result.page_count # Check if should be excluded (no IBAN and no data) - if iban is None and len(rows) == 0 and page_count > 0: + if result.iban is None and len(result.transactions) == 0 and result.page_count > 0: excluded_files.append( { "filename": pdf.name, @@ -148,7 +146,7 @@ def process_all_pdfs( "(likely credit card statement)" ), "timestamp": datetime.now().isoformat(), - "pages": page_count, + "pages": result.page_count, } ) logger.warning( @@ -156,21 +154,24 @@ def process_all_pdfs( idx, pdf.name, ) + results.append(result) continue # Store IBAN if found - if iban: - pdf_ibans[pdf.name] = iban + if result.iban: + pdf_ibans[pdf.name] = result.iban # Apply filters to extracted rows - filtered_rows = self.filter_service.apply_all_filters(rows) + filtered_rows = self.filter_service.apply_all_filters( + transactions_to_dicts(result.transactions) + ) - all_rows.extend(filtered_rows) 
logger.info( "Successfully processed PDF %d: %d transactions extracted", idx, len(filtered_rows), ) + results.append(result) except (OSError, ValueError, AttributeError, KeyError) as e: # Expected errors: file I/O, invalid PDF structure, missing attributes, missing keys @@ -190,7 +191,7 @@ def process_all_pdfs( if excluded_files: self._save_excluded_files(excluded_files) - return all_rows, pages_read, pdf_ibans + return results def _save_ibans(self, pdf_ibans: dict[str, str]) -> None: """Save extracted IBANs to JSON file. diff --git a/packages/parser-core/tests/test_processor.py b/packages/parser-core/tests/test_processor.py index 6c22ec2..eb8d05e 100644 --- a/packages/parser-core/tests/test_processor.py +++ b/packages/parser-core/tests/test_processor.py @@ -222,9 +222,21 @@ def test_run_with_mock_data(self): with patch( "bankstatements_core.services.pdf_processing_orchestrator.PDFProcessingOrchestrator.process_all_pdfs" ) as mock_process: - # Configure mock to return tuple of (transactions, pages_read, ibans) - all_transactions = mock_data_pdf1 + mock_data_pdf2 - mock_process.return_value = (all_transactions, 2, {}) + # Configure mock to return list[ExtractionResult] + mock_process.return_value = [ + ExtractionResult( + transactions=dicts_to_transactions(mock_data_pdf1), + page_count=1, + iban=None, + source_file=pdf1, + ), + ExtractionResult( + transactions=dicts_to_transactions(mock_data_pdf2), + page_count=1, + iban=None, + source_file=pdf2, + ), + ] processor = create_test_processor(self.input_dir, self.output_dir) result = processor.run() From a089d0263c3ff28734e213a2c18353c03440a0a1 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 08:05:36 +0000 Subject: [PATCH 06/14] feat(23-03): wire processor.py to return list[ExtractionResult] - _process_all_pdfs() now returns list[ExtractionResult] directly (no adapter) - run() loops over results, aggregates all_rows/pages_read/pdf_ibans via transactions_to_dicts() - Added ExtractionResult and 
transactions_to_dicts imports at module level - test_processor_refactored_methods.py updated: tuple-unpack assertions replaced with indexed list access --- .../src/bankstatements_core/processor.py | 35 ++++++++----------- .../test_processor_refactored_methods.py | 20 +++++------ 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/processor.py b/packages/parser-core/src/bankstatements_core/processor.py index 381d8c6..5da8dca 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -30,6 +30,8 @@ TransactionProcessingOrchestrator, ) from bankstatements_core.utils import is_date_column, to_float # noqa: F401 +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import transactions_to_dicts logger = logging.getLogger(__name__) @@ -276,34 +278,17 @@ def _detect_duplicates(self, all_rows: list[dict]) -> tuple[list[dict], list[dic """ return self._duplicate_service.detect_and_separate(all_rows) - def _process_all_pdfs(self) -> tuple[list[dict], int, dict[str, str]]: + def _process_all_pdfs(self) -> list[ExtractionResult]: """Process all PDF files in the input directory and extract transaction data. 
Returns: - Tuple of (all_rows, total_pages_read, pdf_ibans) + list[ExtractionResult] — one per processed PDF """ - from bankstatements_core.domain.converters import transactions_to_dicts - # Delegate to PDF processing orchestrator with recursive_scan setting - results = self._pdf_orchestrator.process_all_pdfs( + return self._pdf_orchestrator.process_all_pdfs( self.input_dir, recursive=self.recursive_scan ) - # Adapt list[ExtractionResult] → (all_rows, pages_read, pdf_ibans) for caller - all_rows: list[dict] = [] - pages_read = 0 - pdf_ibans: dict[str, str] = {} - for result in results: - pages_read += result.page_count - # Skip excluded results (no IBAN, no transactions, but had pages) - if result.iban is None and len(result.transactions) == 0 and result.page_count > 0: - continue - if result.iban: - pdf_ibans[result.source_file.name] = result.iban - all_rows.extend(transactions_to_dicts(result.transactions)) - - return all_rows, pages_read, pdf_ibans - def _sort_transactions_by_date(self, rows: list[dict]) -> list[dict]: """ Sort transactions using the configured sorting strategy. 
@@ -325,7 +310,15 @@ def run(self) -> dict: start_time = datetime.now() # Step 1: Extract transactions from all PDFs (delegated to orchestrator) - all_rows, pages_read, pdf_ibans = self._process_all_pdfs() + extraction_results = self._process_all_pdfs() # list[ExtractionResult] + all_rows: list[dict] = [] + pages_read = 0 + pdf_ibans: dict[str, str] = {} + for result in extraction_results: + pages_read += result.page_count + if result.iban: + pdf_ibans[result.source_file.name] = result.iban + all_rows.extend(transactions_to_dicts(result.transactions)) pdf_count = len(sorted(self.input_dir.glob("*.pdf"))) # Step 2: Group transactions by IBAN (delegated to orchestrator) diff --git a/packages/parser-core/tests/test_processor_refactored_methods.py b/packages/parser-core/tests/test_processor_refactored_methods.py index 8e89eca..92ff97d 100644 --- a/packages/parser-core/tests/test_processor_refactored_methods.py +++ b/packages/parser-core/tests/test_processor_refactored_methods.py @@ -87,14 +87,16 @@ def test_process_all_pdfs_multiple_files(self, mock_extract): ), ] - all_rows, pages_read, pdf_ibans = processor._process_all_pdfs() + results = processor._process_all_pdfs() # Verify results - self.assertEqual(len(all_rows), 2) - self.assertEqual(pages_read, 8) # 5 + 3 - self.assertEqual(all_rows[0]["Details"], "Test1") - self.assertEqual(all_rows[1]["Details"], "Test2") - self.assertIsInstance(pdf_ibans, dict) + self.assertEqual(len(results), 2) + self.assertEqual(results[0].page_count, 5) + self.assertEqual(results[1].page_count, 3) + self.assertEqual(results[0].transactions[0].to_dict()["Details"], "Test1") + self.assertEqual(results[1].transactions[0].to_dict()["Details"], "Test2") + self.assertIsNone(results[0].iban) + self.assertIsNone(results[1].iban) @patch( "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" @@ -114,12 +116,10 @@ def test_process_all_pdfs_no_files(self, mock_extract): with patch.object(Path, "glob") as mock_glob: 
mock_glob.return_value = [] - all_rows, pages_read, pdf_ibans = processor._process_all_pdfs() + results = processor._process_all_pdfs() # Should return empty results - self.assertEqual(len(all_rows), 0) - self.assertEqual(pages_read, 0) - self.assertEqual(len(pdf_ibans), 0) + self.assertEqual(len(results), 0) mock_extract.assert_not_called() def test_sort_transactions_by_date_mixed_dates(self): From 0334de62fce11430042fa36efc96e7f051ddcea0 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 09:59:31 +0000 Subject: [PATCH 07/14] test(24-01): add failing tests for word_utils pure functions - group_words_by_y: 8 tests for Y-grouping with tolerance param - assign_words_to_columns: 11 tests for strict/relaxed x-position assignment - calculate_column_coverage: 7 tests for coverage fraction calculation --- .../tests/extraction/test_word_utils.py | 274 ++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 packages/parser-core/tests/extraction/test_word_utils.py diff --git a/packages/parser-core/tests/extraction/test_word_utils.py b/packages/parser-core/tests/extraction/test_word_utils.py new file mode 100644 index 0000000..b92efe8 --- /dev/null +++ b/packages/parser-core/tests/extraction/test_word_utils.py @@ -0,0 +1,274 @@ +"""TDD tests for extraction/word_utils.py. + +Written BEFORE implementation (RED phase). All tests must fail with ImportError +until word_utils.py is created. 
+""" + +from __future__ import annotations + +import pytest + +from bankstatements_core.extraction.word_utils import ( + assign_words_to_columns, + calculate_column_coverage, + group_words_by_y, +) + +# --------------------------------------------------------------------------- +# Shared fixtures / helpers +# --------------------------------------------------------------------------- + +COLUMNS = { + "Date": (0, 50), + "Details": (50, 200), + "Debit": (200, 250), + "Credit": (250, 300), + "Balance": (300, 350), +} + + +def _word(text: str, x0: float, top: float, x1: float | None = None) -> dict: + w: dict = {"text": text, "x0": x0, "top": top} + if x1 is not None: + w["x1"] = x1 + return w + + +# --------------------------------------------------------------------------- +# TestGroupWordsByY +# --------------------------------------------------------------------------- + + +class TestGroupWordsByY: + def test_basic_grouping_two_distinct_y_values(self): + """Words with different top values produce separate groups.""" + words = [ + _word("A", 10, 350.0), + _word("B", 60, 370.0), + ] + result = group_words_by_y(words) + assert 350.0 in result + assert 370.0 in result + assert len(result[350.0]) == 1 + assert result[350.0][0]["text"] == "A" + assert len(result[370.0]) == 1 + assert result[370.0][0]["text"] == "B" + + def test_fractional_tops_round_to_integer_key(self): + """Words with top=100.3 and top=100.7 both round to 100.0 (one group).""" + words = [ + _word("A", 10, 100.3), + _word("B", 60, 100.7), + ] + result = group_words_by_y(words) + assert len(result) == 1 + assert 100.0 in result + assert len(result[100.0]) == 2 + + def test_tolerance_parameter_accepted(self): + """tolerance parameter is accepted without raising TypeError.""" + words = [_word("A", 10, 200.0)] + # Should not raise even though tolerance does not change behaviour + result = group_words_by_y(words, tolerance=5.0) + assert 200.0 in result + + def test_tolerance_does_not_change_grouping(self): + 
"""Grouping is by round(top, 0) regardless of tolerance value.""" + words = [ + _word("A", 10, 100.3), + _word("B", 60, 100.7), + ] + result_default = group_words_by_y(words) + result_large_tolerance = group_words_by_y(words, tolerance=50.0) + assert set(result_default.keys()) == set(result_large_tolerance.keys()) + + def test_no_table_top_y_filtering(self): + """Words with any top value are included -- no table_top_y minimum.""" + words = [ + _word("A", 10, 0.0), + _word("B", 60, 5.0), + _word("C", 60, 999.0), + ] + result = group_words_by_y(words) + assert len(result) == 3 + + def test_empty_list_returns_empty_dict(self): + """Empty word list produces an empty dict.""" + result = group_words_by_y([]) + assert result == {} + + def test_multiple_words_same_y_grouped_together(self): + """Multiple words at the same rounded Y appear in one group.""" + words = [ + _word("01/01", 5, 320.0), + _word("Tesco", 60, 320.0), + _word("12.50", 210, 320.0), + ] + result = group_words_by_y(words) + assert len(result) == 1 + assert len(result[320.0]) == 3 + + def test_returned_dict_keys_are_float(self): + """All keys in the returned dict are floats (not ints).""" + words = [_word("A", 10, 200.0)] + result = group_words_by_y(words) + for key in result: + assert isinstance(key, float), f"Expected float key, got {type(key)}" + + +# --------------------------------------------------------------------------- +# TestAssignWordsToColumns +# --------------------------------------------------------------------------- + + +class TestAssignWordsToColumns: + def test_relaxed_check_by_default(self): + """With strict_rightmost=False (default), all columns use xmin <= x0 < xmax.""" + words = [_word("Hello", 60, 320.0, x1=90)] + result = assign_words_to_columns(words, COLUMNS) + assert result["Details"] == "Hello" + + def test_all_columns_have_empty_string_keys(self): + """Returned dict contains all column names, even those without data.""" + result = assign_words_to_columns([], COLUMNS) + assert 
set(result.keys()) == set(COLUMNS.keys()) + for val in result.values(): + assert val == "" + + def test_empty_words_returns_empty_row(self): + """Empty words list returns dict with all column keys mapped to empty string.""" + result = assign_words_to_columns([], COLUMNS) + assert result == {col: "" for col in COLUMNS} + + def test_strict_rightmost_false_accepts_word_extending_beyond_rightmost(self): + """Without strict_rightmost, a word whose x1 extends past xmax is still placed.""" + # x0=310 is inside Balance (300-350), x1=360 extends beyond xmax=350 + # relaxed check: xmin(300) <= x0(310) < xmax(350) → placed + words = [_word("1234.56", 310, 320.0, x1=360)] + result = assign_words_to_columns(words, COLUMNS) + assert result["Balance"] == "1234.56" + + def test_strict_rightmost_true_rejects_word_extending_beyond_rightmost(self): + """With strict_rightmost=True, a word whose x1 > xmax of rightmost is rejected.""" + # x0=310 inside Balance (300-350), x1=360 extends beyond xmax=350 + # strict check: xmin(300) <= x0(310) AND x1(360) <= xmax(350) → FAILS + words = [_word("1234.56", 310, 320.0, x1=360)] + result = assign_words_to_columns(words, COLUMNS, strict_rightmost=True) + assert result["Balance"] == "" + + def test_strict_rightmost_true_accepts_word_contained_in_rightmost(self): + """With strict_rightmost=True, a word fully within rightmost column is accepted.""" + # x0=310, x1=345 both within Balance (300-350) + words = [_word("100.00", 310, 320.0, x1=345)] + result = assign_words_to_columns(words, COLUMNS, strict_rightmost=True) + assert result["Balance"] == "100.00" + + def test_strict_rightmost_does_not_affect_non_rightmost_columns(self): + """With strict_rightmost=True, non-rightmost columns still use relaxed check.""" + # Details word x0=60 x1=999 (way beyond Details xmax=200) + # relaxed check: xmin(50) <= x0(60) < xmax(200) → placed (x1 ignored) + words = [_word("Tesco", 60, 320.0, x1=999)] + result = assign_words_to_columns(words, COLUMNS, 
strict_rightmost=True) + assert result["Details"] == "Tesco" + + def test_x1_fallback_when_key_missing(self): + """When x1 key is absent, fallback x1 = x0 + max(len(text)*3, 10) is used.""" + # "Hi" has len=2; fallback: x0(60) + max(2*3, 10) = 60 + 10 = 70 + # With strict_rightmost=True, Balance (300-350): x0=310, x1=320 (both inside) → placed + words = [{"text": "Hi", "x0": 310, "top": 320.0}] # no x1 key + result = assign_words_to_columns(words, COLUMNS, strict_rightmost=True) + # fallback x1 = 310 + max(len("Hi")*3, 10) = 310 + 10 = 320 <= 350 → accepted + assert result["Balance"] == "Hi" + + def test_x1_fallback_strict_rightmost_rejects_with_long_text(self): + """Long text without x1 causes large fallback x1, which fails strict check.""" + # "LongTextHere" has len=12; fallback: x0(310) + max(12*3, 10) = 310 + 36 = 346 <= 350 + # Actually 346 <= 350 passes. Use a very long text. + # "AReallylongtextstring" has len=21; fallback: 310 + max(21*3, 10) = 310 + 63 = 373 > 350 → rejected + long_text = "AReallylongtextstring" # len=21 + words = [{"text": long_text, "x0": 310, "top": 320.0}] # no x1 + result = assign_words_to_columns(words, COLUMNS, strict_rightmost=True) + assert result["Balance"] == "" + + def test_multiple_words_concatenated_with_space(self): + """Multiple words in the same column are concatenated with space, then stripped.""" + words = [ + _word("John", 60, 320.0, x1=90), + _word("Smith", 95, 320.0, x1=130), + ] + result = assign_words_to_columns(words, COLUMNS) + assert result["Details"] == "John Smith" + + def test_values_are_stripped(self): + """Final values have leading/trailing whitespace stripped.""" + words = [_word("Hi", 60, 320.0, x1=90)] + result = assign_words_to_columns(words, COLUMNS) + assert result["Details"] == "Hi" + assert not result["Details"].endswith(" ") + + def test_word_outside_all_columns_not_assigned(self): + """A word whose x0 falls outside every column boundary is not placed anywhere.""" + # x0=500 is beyond all column 
boundaries (max xmax=350) + words = [_word("Orphan", 500, 320.0, x1=530)] + result = assign_words_to_columns(words, COLUMNS) + for val in result.values(): + assert val == "" + + +# --------------------------------------------------------------------------- +# TestCalculateColumnCoverage +# --------------------------------------------------------------------------- + + +class TestCalculateColumnCoverage: + def test_all_columns_have_data_returns_one(self): + """When every column has data in at least one row, coverage is 1.0.""" + rows = [ + {"Date": "01/01", "Details": "Tesco", "Debit": "12.50", "Credit": "", "Balance": "100.00"}, + {"Date": "", "Details": "AIB", "Debit": "", "Credit": "50.00", "Balance": ""}, + ] + result = calculate_column_coverage(rows, COLUMNS) + assert result == 1.0 + + def test_partial_coverage(self): + """2 of 4 columns having data returns 0.5.""" + two_col_columns = {"Date": (0, 50), "Details": (50, 200), "Debit": (200, 250), "Credit": (250, 300)} + rows = [ + {"Date": "01/01", "Details": "Tesco", "Debit": "", "Credit": ""}, + ] + result = calculate_column_coverage(rows, two_col_columns) + assert result == 0.5 + + def test_empty_rows_returns_zero(self): + """No rows means no coverage.""" + result = calculate_column_coverage([], COLUMNS) + assert result == 0.0 + + def test_empty_columns_returns_zero(self): + """No columns means no coverage.""" + rows = [{"Date": "01/01"}] + result = calculate_column_coverage(rows, {}) + assert result == 0.0 + + def test_whitespace_only_values_not_counted(self): + """Values that are only whitespace are not counted as data.""" + rows = [ + {"Date": " ", "Details": "\t", "Debit": "", "Credit": "", "Balance": ""}, + ] + result = calculate_column_coverage(rows, COLUMNS) + assert result == 0.0 + + def test_single_row_single_column_has_data(self): + """1 of 5 columns having data in one row returns 0.2.""" + rows = [{"Date": "", "Details": "Tesco", "Debit": "", "Credit": "", "Balance": ""}] + result = 
calculate_column_coverage(rows, COLUMNS) + assert result == pytest.approx(0.2) + + def test_same_column_in_multiple_rows_counts_once(self): + """The same column name is counted once even if it has data in multiple rows.""" + rows = [ + {"Date": "01/01", "Details": "", "Debit": "", "Credit": "", "Balance": ""}, + {"Date": "02/01", "Details": "", "Debit": "", "Credit": "", "Balance": ""}, + ] + result = calculate_column_coverage(rows, COLUMNS) + assert result == pytest.approx(0.2) From aecb9d34de49ef37074ce204eac2595235e1ba71 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:03:30 +0000 Subject: [PATCH 08/14] feat(24-01): implement word_utils.py with three pure standalone functions - group_words_by_y: groups words by round(top, 0); tolerance param accepted for API compat - assign_words_to_columns: relaxed/strict rightmost column boundary check - calculate_column_coverage: returns fraction of columns with non-empty data across rows - Fix test rounding: round(100.7, 0) is 101.0 (ordinary rounding; banker's round-half-to-even only affects exact .5 ties) --- .../extraction/word_utils.py | 128 ++++++++++++++++++ .../tests/extraction/test_word_utils.py | 11 +- 2 files changed, 136 insertions(+), 3 deletions(-) create mode 100644 packages/parser-core/src/bankstatements_core/extraction/word_utils.py diff --git a/packages/parser-core/src/bankstatements_core/extraction/word_utils.py b/packages/parser-core/src/bankstatements_core/extraction/word_utils.py new file mode 100644 index 0000000..95cbff0 --- /dev/null +++ b/packages/parser-core/src/bankstatements_core/extraction/word_utils.py @@ -0,0 +1,128 @@ +"""Pure standalone word-processing utility functions for PDF table extraction. 
+ +These functions consolidate duplicated private-method logic from: +- extraction/boundary_detector.py (_group_words_by_y, _build_row_from_words, + _calculate_column_coverage) +- extraction/row_builder.py (inline word-grouping and column-assignment loop) +- services/header_detection.py (_group_words_by_y_coordinate) +- services/page_validation.py (calculate_column_coverage -- canonical version) + +This module is intentionally NOT re-exported from extraction/__init__.py. +Callers import directly: + from bankstatements_core.extraction.word_utils import group_words_by_y +""" + +from __future__ import annotations + + +def group_words_by_y( + words: list[dict], + tolerance: float = 1.0, +) -> dict[float, list[dict]]: + """Group words by Y-coordinate (rounded to the nearest integer). + + Args: + words: List of word dicts, each containing at least ``"top"`` and + ``"text"`` keys. + tolerance: Accepted for API compatibility but the current implementation + always rounds to 0 decimal places (``round(w["top"], 0)``), + which is the behaviour used by all original callers. Changing + tolerance to a non-zero value does **not** affect the grouping. + + Returns: + A dict mapping ``float`` Y-coordinate keys to the list of word dicts + at that rounded Y position. Keys are the result of + ``round(w["top"], 0)``, which always produces a ``float`` in Python 3. + + Note: + This function applies **no** ``table_top_y`` filtering. Callers that + need to exclude words above the table boundary should pre-filter the + word list before calling this function. + """ + lines: dict[float, list[dict]] = {} + for w in words: + y_key = round(w["top"], 0) + lines.setdefault(y_key, []).append(w) + return lines + + +def assign_words_to_columns( + words: list[dict], + columns: dict[str, tuple[int | float, int | float]], + strict_rightmost: bool = False, +) -> dict[str, str]: + """Assign words to named columns based on their x-position. + + Args: + words: List of word dicts at a single Y-coordinate. 
Each dict must + contain ``"text"`` and ``"x0"`` keys. ``"x1"`` is optional; if + absent the right edge is estimated as + ``x0 + max(len(text) * 3, 10)``. + columns: Ordered mapping of column name to ``(xmin, xmax)`` boundary + tuple. The last entry is treated as the *rightmost* column when + ``strict_rightmost=True``. + strict_rightmost: When ``True``, the rightmost column uses a strict + containment check (``xmin <= x0 and x1 <= xmax``) to prevent + footer or overflow text from bleeding into balance/amount columns. + When ``False`` (default), all columns use a relaxed left-edge + check (``xmin <= x0 < xmax``). + + Returns: + A dict mapping every column name to its accumulated text value, + stripped of leading/trailing whitespace. Columns with no matching + words map to an empty string. + """ + column_names = list(columns.keys()) + rightmost = column_names[-1] if column_names else None + row: dict[str, str] = dict.fromkeys(columns, "") + + for w in words: + x0: float = w["x0"] + text: str = w["text"] + x1: float = w.get("x1", x0 + max(len(text) * 3, 10)) + + for col, (xmin, xmax) in columns.items(): + if strict_rightmost and col == rightmost: + if xmin <= x0 and x1 <= xmax: + row[col] += text + " " + break + else: + if xmin <= x0 < xmax: + row[col] += text + " " + break + + return {k: v.strip() for k, v in row.items()} + + +def calculate_column_coverage( + rows: list[dict], + columns: dict[str, tuple[int | float, int | float]], +) -> float: + """Return the fraction of columns that have non-empty data across all rows. + + A column is counted as having data if at least one row contains a + non-empty, non-whitespace string for that column name. + + Args: + rows: List of row dicts, each mapping column names to string values. + columns: Column definitions mapping name to ``(xmin, xmax)``. Used + to determine the full set of column names and the denominator for + the coverage fraction. + + Returns: + A float in ``[0.0, 1.0]``. 
Returns ``0.0`` when ``rows`` is empty + or ``columns`` is empty. + + Note: + Canonical source: ``PageValidationService.calculate_column_coverage`` + in ``services/page_validation.py`` (L116–L133). + """ + if not rows or not columns: + return 0.0 + column_names = list(columns.keys()) + columns_with_data: set[str] = set() + for row in rows: + for col_name in column_names: + if row.get(col_name, "").strip(): + columns_with_data.add(col_name) + return len(columns_with_data) / len(column_names) diff --git a/packages/parser-core/tests/extraction/test_word_utils.py b/packages/parser-core/tests/extraction/test_word_utils.py index b92efe8..79f14a4 100644 --- a/packages/parser-core/tests/extraction/test_word_utils.py +++ b/packages/parser-core/tests/extraction/test_word_utils.py @@ -55,10 +55,15 @@ def test_basic_grouping_two_distinct_y_values(self): assert result[370.0][0]["text"] == "B" def test_fractional_tops_round_to_integer_key(self): - """Words with top=100.3 and top=100.7 both round to 100.0 (one group).""" + """Words with top values that round to the same integer produce one group. + + round(100.3, 0) == 100.0 and round(100.4, 0) == 100.0 → one group. + Note: round(100.7, 0) == 101.0 (ordinary rounding — banker's round-half-to-even + only affects exact .5 ties, e.g. round(100.5, 0) == 100.0), producing a separate + group. The implementation uses Python's built-in round(top, 0). 
+ """ words = [ _word("A", 10, 100.3), - _word("B", 60, 100.7), + _word("B", 60, 100.4), ] result = group_words_by_y(words) assert len(result) == 1 @@ -76,7 +81,7 @@ def test_tolerance_does_not_change_grouping(self): """Grouping is by round(top, 0) regardless of tolerance value.""" words = [ _word("A", 10, 100.3), - _word("B", 60, 100.7), + _word("B", 60, 100.4), ] result_default = group_words_by_y(words) result_large_tolerance = group_words_by_y(words, tolerance=50.0) From 69046c051f31f6871601ef5da357069a3500d2e2 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:12:53 +0000 Subject: [PATCH 09/14] feat(24-02): migrate all callers to word_utils, delete private originals - boundary_detector.py: import group_words_by_y/assign_words_to_columns/calculate_column_coverage; add pre-filter before group_words_by_y; delete _group_words_by_y, _build_row_from_words, _calculate_column_coverage - row_builder.py: import assign_words_to_columns/group_words_by_y; replace inline loops with word_utils calls; remove _rightmost_column/_column_names attrs - header_detection.py: import group_words_by_y; delete _group_words_by_y_coordinate private method - page_validation.py: import calculate_column_coverage as _calculate_column_coverage_impl; make calculate_column_coverage a thin wrapper - content_density.py: import assign_words_to_columns; replace inline column-assignment loop --- .../extraction/boundary_detector.py | 80 +++---------------- .../extraction/row_builder.py | 31 +------ .../services/content_density.py | 16 +--- .../services/header_detection.py | 28 +------ .../services/page_validation.py | 18 ++--- 5 files changed, 28 insertions(+), 145 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/boundary_detector.py b/packages/parser-core/src/bankstatements_core/extraction/boundary_detector.py index 997cc63..7c4a8ef 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/boundary_detector.py +++ 
b/packages/parser-core/src/bankstatements_core/extraction/boundary_detector.py @@ -13,6 +13,11 @@ RowClassifier, create_row_classifier_chain, ) +from bankstatements_core.extraction.word_utils import ( + assign_words_to_columns, + calculate_column_coverage, + group_words_by_y, +) logger = logging.getLogger(__name__) @@ -97,7 +102,8 @@ def detect_boundary(self, words: list[dict]) -> int: return self.fallback_bottom_y # Phase 0: Group words and find transaction positions - lines = self._group_words_by_y(words) + filtered_words = [w for w in words if w["top"] >= self.table_top_y] + lines = group_words_by_y(filtered_words) if not lines: return self.fallback_bottom_y @@ -156,23 +162,6 @@ def detect_boundary(self, words: list[dict]) -> int: logger.debug("No clear table end detected - using fallback boundary") return self.fallback_bottom_y - def _group_words_by_y(self, words: list[dict]) -> dict[float, list[dict]]: - """ - Group words by Y-coordinate (rounded). - - Args: - words: List of word dictionaries with 'top', 'x0', 'text' keys - - Returns: - Dictionary mapping Y-coordinate to list of words at that Y - """ - lines: dict[float, list[dict]] = {} - for w in words: - if w["top"] >= self.table_top_y: - y_key = round(w["top"], 0) - lines.setdefault(y_key, []).append(w) - return lines - def _find_transaction_positions( self, lines: dict[float, list[dict]], sorted_y_coords: list[float] ) -> tuple[list[float], float | None]: @@ -190,7 +179,7 @@ def _find_transaction_positions( last_transaction_y = None for y_coord in sorted_y_coords: - row = self._build_row_from_words(lines[y_coord]) + row = assign_words_to_columns(lines[y_coord], self.columns) if any(row.values()): row_type = self._row_classifier.classify(row, self.columns) @@ -278,7 +267,7 @@ def _detect_by_spatial_gaps( post_gap_transactions = 0 for y_coord in post_gap_y_coords: - row = self._build_row_from_words(lines[y_coord]) + row = assign_words_to_columns(lines[y_coord], self.columns) if ( any(row.values()) @@ 
-320,11 +309,11 @@ def _detect_by_structure_breakdown( if last_transaction_y is not None and y_coord <= last_transaction_y: continue - row = self._build_row_from_words(lines[y_coord]) + row = assign_words_to_columns(lines[y_coord], self.columns) if any(row.values()): # Check if this row has any structure (data in expected columns) - column_coverage = self._calculate_column_coverage([row]) + column_coverage = calculate_column_coverage([row], self.columns) if column_coverage < 0.3: # Less than 30% of columns have data structure_breakdown_count += 1 else: @@ -369,7 +358,7 @@ def _detect_by_consecutive_non_transactions( if last_transaction_y is not None and y_coord <= last_transaction_y: continue - row = self._build_row_from_words(lines[y_coord]) + row = assign_words_to_columns(lines[y_coord], self.columns) if any(row.values()): row_type = self._row_classifier.classify(row, self.columns) @@ -391,48 +380,3 @@ def _detect_by_consecutive_non_transactions( ) return None - - def _build_row_from_words(self, words: list[dict]) -> dict[str, str]: - """ - Build a row dictionary from words by assigning to columns. - - Args: - words: List of words at the same Y-coordinate - - Returns: - Dictionary mapping column names to concatenated text - """ - row = dict.fromkeys(self.columns, "") - - for w in words: - x0 = w["x0"] - text = w["text"] - for col, (xmin, xmax) in self.columns.items(): - if xmin <= x0 < xmax: - row[col] += text + " " - break - - return {k: v.strip() for k, v in row.items()} - - def _calculate_column_coverage(self, rows: list[dict[str, str]]) -> float: - """ - Calculate what percentage of columns have data in the given rows. 
- - Args: - rows: List of row dictionaries - - Returns: - Float between 0.0 and 1.0 representing column coverage - """ - if not rows: - return 0.0 - - total_columns = len(self.columns) - columns_with_data = set() - - for row in rows: - for col_name, value in row.items(): - if value and value.strip(): - columns_with_data.add(col_name) - - return len(columns_with_data) / total_columns if total_columns > 0 else 0.0 diff --git a/packages/parser-core/src/bankstatements_core/extraction/row_builder.py b/packages/parser-core/src/bankstatements_core/extraction/row_builder.py index 6034a50..47fb246 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/row_builder.py +++ b/packages/parser-core/src/bankstatements_core/extraction/row_builder.py @@ -9,6 +9,8 @@ import logging from typing import TYPE_CHECKING +from bankstatements_core.extraction.word_utils import assign_words_to_columns, group_words_by_y + if TYPE_CHECKING: from bankstatements_core.extraction.row_classifiers import RowClassifier @@ -31,8 +33,6 @@ def __init__( ) -> None: self._columns = columns self._row_classifier = row_classifier - self._column_names = list(columns.keys()) - self._rightmost_column = self._column_names[-1] if self._column_names else None def build_rows(self, words: list[dict]) -> list[dict]: """Group words by Y position, assign to columns, return transaction/continuation rows. 
@@ -43,35 +43,12 @@ def build_rows(self, words: list[dict]) -> list[dict]: Returns: List of row dictionaries classified as 'transaction' or 'continuation' """ - lines: dict[float, list[dict]] = {} - for w in words: - y_key = round(w["top"], 0) - lines.setdefault(y_key, []).append(w) - + lines = group_words_by_y(words) page_rows = [] for _, line_words in sorted(lines.items()): - row = dict.fromkeys(self._columns, "") - - for w in line_words: - x0 = w["x0"] - x1 = w.get("x1", x0 + max(len(w["text"]) * 3, 10)) - text = w["text"] - - for col, (xmin, xmax) in self._columns.items(): - if col == self._rightmost_column: - if xmin <= x0 and x1 <= xmax: - row[col] += text + " " - break - else: - if xmin <= x0 < xmax: - row[col] += text + " " - break - - row = {k: v.strip() for k, v in row.items()} - + row = assign_words_to_columns(line_words, self._columns, strict_rightmost=True) if any(row.values()): row_type = self._row_classifier.classify(row, self._columns) if row_type in ["transaction", "continuation"]: page_rows.append(row) - return page_rows diff --git a/packages/parser-core/src/bankstatements_core/services/content_density.py b/packages/parser-core/src/bankstatements_core/services/content_density.py index 59ea329..f0d1afb 100644 --- a/packages/parser-core/src/bankstatements_core/services/content_density.py +++ b/packages/parser-core/src/bankstatements_core/services/content_density.py @@ -8,6 +8,8 @@ import logging +from bankstatements_core.extraction.word_utils import assign_words_to_columns + logger = logging.getLogger(__name__) @@ -67,19 +69,7 @@ def analyze_content_density( for y_coord in window_y_coords: # Form row from words at this Y coordinate - row = dict.fromkeys(columns, "") - - for w in word_groups[y_coord]: - x0 = w["x0"] - text = w["text"] - - for col, (xmin, xmax) in columns.items(): - if xmin <= x0 < xmax: - row[col] += text + " " - break - - # Normalize row data - row = {k: v.strip() for k, v in row.items()} + row = 
assign_words_to_columns(word_groups[y_coord], columns) if any(row.values()): # Only count non-empty rows total_rows += 1 diff --git a/packages/parser-core/src/bankstatements_core/services/header_detection.py b/packages/parser-core/src/bankstatements_core/services/header_detection.py index 988e782..d47c13b 100644 --- a/packages/parser-core/src/bankstatements_core/services/header_detection.py +++ b/packages/parser-core/src/bankstatements_core/services/header_detection.py @@ -10,6 +10,8 @@ import logging from typing import TYPE_CHECKING +from bankstatements_core.extraction.word_utils import group_words_by_y + if TYPE_CHECKING: import pdfplumber @@ -131,7 +133,7 @@ def detect_headers( return False # Group words by Y coordinate to find potential header rows - rows_by_y = self._group_words_by_y_coordinate(words) + rows_by_y = group_words_by_y(words) # Check top few rows for header-like content top_rows = sorted(rows_by_y.keys())[:max_rows_to_check] @@ -162,30 +164,6 @@ def detect_headers( ) return False - def _group_words_by_y_coordinate( - self, words: list[dict], tolerance: float = 1.0 - ) -> dict[float, list[dict]]: - """Group words by their Y coordinate (row position). - - Words with Y coordinates within `tolerance` pixels are grouped together - as being on the same row. Uses rounding to group words efficiently. 
- - Args: - words: List of word dictionaries - tolerance: Y coordinate tolerance for grouping (pixels) - - Returns: - Dictionary mapping Y coordinates to lists of words - """ - rows_by_y: dict[float, list[dict]] = {} - - for word in words: - # Round Y coordinate to nearest integer for grouping - y_key = round(word.get("top", 0), 0) - rows_by_y.setdefault(y_key, []).append(word) - - return rows_by_y - def check_page_for_headers( self, page: pdfplumber.page.Page, diff --git a/packages/parser-core/src/bankstatements_core/services/page_validation.py b/packages/parser-core/src/bankstatements_core/services/page_validation.py index 5347886..db60457 100644 --- a/packages/parser-core/src/bankstatements_core/services/page_validation.py +++ b/packages/parser-core/src/bankstatements_core/services/page_validation.py @@ -9,6 +9,9 @@ import logging from bankstatements_core.domain.column_types import get_type_as_string +from bankstatements_core.extraction.word_utils import ( + calculate_column_coverage as _calculate_column_coverage_impl, +) logger = logging.getLogger(__name__) @@ -118,6 +121,8 @@ def calculate_column_coverage( ) -> float: """Calculate what percentage of columns have meaningful data. + Delegates to word_utils.calculate_column_coverage for the canonical implementation. + Args: rows: List of extracted rows columns: Column definitions @@ -132,18 +137,7 @@ def calculate_column_coverage( >>> service.calculate_column_coverage(rows, columns) 0.666... 
""" - if not rows or not columns: - return 0.0 - - column_names = list(columns.keys()) - columns_with_data = set() - - for row in rows: - for col_name in column_names: - if row.get(col_name, "").strip(): - columns_with_data.add(col_name) - - return len(columns_with_data) / len(column_names) + return _calculate_column_coverage_impl(rows, columns) def has_column_type( self, From feed87a80d0f591cf25a090f51dc84bb3c257440 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:14:31 +0000 Subject: [PATCH 10/14] refactor(24-02): remove private-method test calls, covered by test_word_utils - test_boundary_detector.py: delete test_group_words_by_y, test_group_words_filters_above_table_top, test_build_row_from_words, test_calculate_column_coverage_full, test_calculate_column_coverage_partial, test_calculate_column_coverage_empty - test_header_detection.py: delete test_group_words_by_y_coordinate - All deleted tests now covered by test_word_utils.py TestGroupWordsByY, TestAssignWordsToColumns, TestCalculateColumnCoverage --- .../extraction/test_boundary_detector.py | 88 ------------------- .../tests/services/test_header_detection.py | 17 ---- 2 files changed, 105 deletions(-) diff --git a/packages/parser-core/tests/extraction/test_boundary_detector.py b/packages/parser-core/tests/extraction/test_boundary_detector.py index d848f2a..27cbb90 100644 --- a/packages/parser-core/tests/extraction/test_boundary_detector.py +++ b/packages/parser-core/tests/extraction/test_boundary_detector.py @@ -61,36 +61,6 @@ def test_no_transactions_returns_fallback(self): boundary = detector.detect_boundary(words) assert boundary == 720 - def test_group_words_by_y(self): - """Test grouping words by Y-coordinate.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - words = [ - {"text": "Word1", "x0": 30, "top": 350}, - {"text": "Word2", "x0": 60, "top": 350}, - {"text": "Word3", "x0": 30, "top": 370}, - ] - lines = 
detector._group_words_by_y(words) - assert len(lines) == 2 - assert 350.0 in lines - assert 370.0 in lines - assert len(lines[350.0]) == 2 - assert len(lines[370.0]) == 1 - - def test_group_words_filters_above_table_top(self): - """Test that words above table_top_y are filtered out.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - words = [ - {"text": "Above", "x0": 30, "top": 250}, # Above table_top_y - {"text": "Below", "x0": 30, "top": 350}, # Below table_top_y - ] - lines = detector._group_words_by_y(words) - assert len(lines) == 1 - assert 350.0 in lines - def test_find_transaction_positions(self): """Test finding transaction positions.""" detector = TableBoundaryDetector( @@ -220,64 +190,6 @@ def test_detect_by_spatial_gaps_requires_no_transactions_after(self): # Should return None because there's a transaction after the gap assert result is None - def test_build_row_from_words(self): - """Test building a row from words.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - words = [ - {"text": "01", "x0": 30, "top": 350}, - {"text": "Jan", "x0": 35, "top": 350}, - {"text": "Purchase", "x0": 60, "top": 350}, - {"text": "50.00", "x0": 210, "top": 350}, - ] - row = detector._build_row_from_words(words) - assert "01 Jan" in row["Date"] - assert "Purchase" in row["Details"] - assert "50.00" in row["Debit €"] - - def test_calculate_column_coverage_full(self): - """Test column coverage calculation with full coverage.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - rows = [ - { - "Date": "01 Jan", - "Details": "Purchase", - "Debit €": "50.00", - "Credit €": "", - "Balance €": "100.00", - } - ] - coverage = detector._calculate_column_coverage(rows) - assert coverage == 0.8 # 4 out of 5 columns (Credit is empty) - - def test_calculate_column_coverage_partial(self): - """Test column coverage 
calculation with partial coverage.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - rows = [ - { - "Date": "", - "Details": "Text", - "Debit €": "", - "Credit €": "", - "Balance €": "", - } - ] - coverage = detector._calculate_column_coverage(rows) - assert coverage == 0.2 # 1 out of 5 columns - - def test_calculate_column_coverage_empty(self): - """Test column coverage calculation with empty rows.""" - detector = TableBoundaryDetector( - columns=TEST_COLUMNS, fallback_bottom_y=720, table_top_y=300 - ) - coverage = detector._calculate_column_coverage([]) - assert coverage == 0.0 - def test_detect_by_structure_breakdown(self): """Test detection by column structure breakdown.""" detector = TableBoundaryDetector( diff --git a/packages/parser-core/tests/services/test_header_detection.py b/packages/parser-core/tests/services/test_header_detection.py index 0ba81d7..7d32b59 100644 --- a/packages/parser-core/tests/services/test_header_detection.py +++ b/packages/parser-core/tests/services/test_header_detection.py @@ -153,23 +153,6 @@ def test_detect_headers_custom_min_keywords(self, service, sample_columns): result = service.detect_headers(words, sample_columns, min_keywords=1) assert result is True - def test_group_words_by_y_coordinate(self, service): - """Test word grouping by Y coordinate.""" - words = [ - {"text": "A", "top": 100.0}, - {"text": "B", "top": 100.5}, # Close to 100, should group - {"text": "C", "top": 105.0}, - {"text": "D", "top": 105.2}, # Close to 105, should group - ] - grouped = service._group_words_by_y_coordinate(words) - - # Should have 2 groups (rounded to 100 and 105) - assert len(grouped) == 2 - assert 100.0 in grouped - assert 105.0 in grouped - assert len(grouped[100.0]) == 2 # A and B - assert len(grouped[105.0]) == 2 # C and D - def test_header_keywords_coverage(self, service): """Test that service has comprehensive header keywords.""" # Verify key banking terms are included From 
3307b34141169e404215fe5ebb95f61fb48c3d3d Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:51:42 +0000 Subject: [PATCH 11/14] style: apply black formatting to parser-core --- .../extraction/row_builder.py | 9 ++++- .../services/pdf_processing_orchestrator.py | 6 ++- .../templates/template_detector.py | 11 +++-- .../tests/extraction/test_word_utils.py | 27 +++++++++++-- .../services/test_extraction_orchestrator.py | 14 ++++--- .../tests/test_credit_card_detection.py | 4 +- .../tests/test_excluded_files_logging.py | 40 +++++++++++++------ .../tests/test_pdf_table_extractor.py | 16 +++++--- packages/parser-core/tests/test_processor.py | 29 ++++++++++---- .../test_processor_refactored_methods.py | 34 +++++++++++----- .../test_transaction_data_no_blocking.py | 26 +++++++----- 11 files changed, 153 insertions(+), 63 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/row_builder.py b/packages/parser-core/src/bankstatements_core/extraction/row_builder.py index 47fb246..c1f59fd 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/row_builder.py +++ b/packages/parser-core/src/bankstatements_core/extraction/row_builder.py @@ -9,7 +9,10 @@ import logging from typing import TYPE_CHECKING -from bankstatements_core.extraction.word_utils import assign_words_to_columns, group_words_by_y +from bankstatements_core.extraction.word_utils import ( + assign_words_to_columns, + group_words_by_y, +) if TYPE_CHECKING: from bankstatements_core.extraction.row_classifiers import RowClassifier @@ -46,7 +49,9 @@ def build_rows(self, words: list[dict]) -> list[dict]: lines = group_words_by_y(words) page_rows = [] for _, line_words in sorted(lines.items()): - row = assign_words_to_columns(line_words, self._columns, strict_rightmost=True) + row = assign_words_to_columns( + line_words, self._columns, strict_rightmost=True + ) if any(row.values()): row_type = self._row_classifier.classify(row, self._columns) if row_type in 
["transaction", "continuation"]: diff --git a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py index 79cc0a0..95f8a9a 100644 --- a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py @@ -136,7 +136,11 @@ def process_all_pdfs( pages_read += result.page_count # Check if should be excluded (no IBAN and no data) - if result.iban is None and len(result.transactions) == 0 and result.page_count > 0: + if ( + result.iban is None + and len(result.transactions) == 0 + and result.page_count > 0 + ): excluded_files.append( { "filename": pdf.name, diff --git a/packages/parser-core/src/bankstatements_core/templates/template_detector.py b/packages/parser-core/src/bankstatements_core/templates/template_detector.py index 6791451..9c481a7 100644 --- a/packages/parser-core/src/bankstatements_core/templates/template_detector.py +++ b/packages/parser-core/src/bankstatements_core/templates/template_detector.py @@ -117,7 +117,9 @@ class DetectionExplanation: class TemplateDetector: """Orchestrates multi-signal template detection using confidence-based scoring.""" - def __init__(self, registry: TemplateRegistry, scoring: ScoringConfig | None = None): + def __init__( + self, registry: TemplateRegistry, scoring: ScoringConfig | None = None + ): """Initialize detector with template registry. 
Args: @@ -542,7 +544,9 @@ def get_detection_explanation( if tie_broken: # Determine reason without re-running full tie-break has_iban = { - tid: any(d.detector_name == "IBAN" for d in template_details.get(tid, [])) + tid: any( + d.detector_name == "IBAN" for d in template_details.get(tid, []) + ) for tid in tied_templates } if any(has_iban.values()): @@ -550,7 +554,8 @@ def get_detection_explanation( else: max_confs = { tid: max( - (d.confidence for d in template_details.get(tid, [])), default=0.0 + (d.confidence for d in template_details.get(tid, [])), + default=0.0, ) for tid in tied_templates } diff --git a/packages/parser-core/tests/extraction/test_word_utils.py b/packages/parser-core/tests/extraction/test_word_utils.py index 79f14a4..f404ea4 100644 --- a/packages/parser-core/tests/extraction/test_word_utils.py +++ b/packages/parser-core/tests/extraction/test_word_utils.py @@ -229,15 +229,32 @@ class TestCalculateColumnCoverage: def test_all_columns_have_data_returns_one(self): """When every column has data in at least one row, coverage is 1.0.""" rows = [ - {"Date": "01/01", "Details": "Tesco", "Debit": "12.50", "Credit": "", "Balance": "100.00"}, - {"Date": "", "Details": "AIB", "Debit": "", "Credit": "50.00", "Balance": ""}, + { + "Date": "01/01", + "Details": "Tesco", + "Debit": "12.50", + "Credit": "", + "Balance": "100.00", + }, + { + "Date": "", + "Details": "AIB", + "Debit": "", + "Credit": "50.00", + "Balance": "", + }, ] result = calculate_column_coverage(rows, COLUMNS) assert result == 1.0 def test_partial_coverage(self): """2 of 4 columns having data returns 0.5.""" - two_col_columns = {"Date": (0, 50), "Details": (50, 200), "Debit": (200, 250), "Credit": (250, 300)} + two_col_columns = { + "Date": (0, 50), + "Details": (50, 200), + "Debit": (200, 250), + "Credit": (250, 300), + } rows = [ {"Date": "01/01", "Details": "Tesco", "Debit": "", "Credit": ""}, ] @@ -265,7 +282,9 @@ def test_whitespace_only_values_not_counted(self): def 
test_single_row_single_column_has_data(self): """1 of 5 columns having data in one row returns 0.2.""" - rows = [{"Date": "", "Details": "Tesco", "Debit": "", "Credit": "", "Balance": ""}] + rows = [ + {"Date": "", "Details": "Tesco", "Debit": "", "Credit": "", "Balance": ""} + ] result = calculate_column_coverage(rows, COLUMNS) assert result == pytest.approx(0.2) diff --git a/packages/parser-core/tests/services/test_extraction_orchestrator.py b/packages/parser-core/tests/services/test_extraction_orchestrator.py index 49d4c84..9f91a9b 100644 --- a/packages/parser-core/tests/services/test_extraction_orchestrator.py +++ b/packages/parser-core/tests/services/test_extraction_orchestrator.py @@ -37,7 +37,9 @@ def test_extract_from_pdf_success(self, mock_extract): # Mock successful extraction mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([{"Date": "01/01/23", "Details": "Test"}]), + transactions=dicts_to_transactions( + [{"Date": "01/01/23", "Details": "Test"}] + ), page_count=5, iban="IE12BOFI90000112345", source_file=pdf_path, @@ -109,10 +111,12 @@ def test_extract_from_pdf_returns_rows_as_is(self, mock_extract): pdf_path.write_text("fake pdf") mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([ - {"Date": "01/01/23", "Details": "Test1"}, - {"Date": "02/01/23", "Details": "Test2"}, - ]), + transactions=dicts_to_transactions( + [ + {"Date": "01/01/23", "Details": "Test1"}, + {"Date": "02/01/23", "Details": "Test2"}, + ] + ), page_count=3, iban=None, source_file=pdf_path, diff --git a/packages/parser-core/tests/test_credit_card_detection.py b/packages/parser-core/tests/test_credit_card_detection.py index 85935a6..76e3791 100644 --- a/packages/parser-core/tests/test_credit_card_detection.py +++ b/packages/parser-core/tests/test_credit_card_detection.py @@ -175,7 +175,9 @@ def test_all_credit_card_patterns(self, mock_pdfplumber): extractor = PDFTableExtractor(columns=TEST_COLUMNS) result = 
extractor.extract(Path("/tmp/test.pdf")) - assert len(result.transactions) == 0, f"Failed to detect with pattern: {pattern_name}" + assert ( + len(result.transactions) == 0 + ), f"Failed to detect with pattern: {pattern_name}" @patch("bankstatements_core.adapters.pdfplumber_adapter.pdfplumber.open") def test_does_not_detect_false_positives(self, mock_pdfplumber): diff --git a/packages/parser-core/tests/test_excluded_files_logging.py b/packages/parser-core/tests/test_excluded_files_logging.py index 38492df..d59b70a 100644 --- a/packages/parser-core/tests/test_excluded_files_logging.py +++ b/packages/parser-core/tests/test_excluded_files_logging.py @@ -43,7 +43,9 @@ def test_excluded_files_json_created(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=[], page_count=1, iban=None, + transactions=[], + page_count=1, + iban=None, source_file=Path("credit_card.pdf"), warnings=["credit card statement detected, skipped"], ) @@ -84,9 +86,9 @@ def test_excluded_files_json_not_created_when_no_exclusions(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([ - {"Date": "01/12/23", "Details": "Test", "Debit €": "100"} - ]), + transactions=dicts_to_transactions( + [{"Date": "01/12/23", "Details": "Test", "Debit €": "100"}] + ), page_count=1, iban="IE29AIBK93115212345678", source_file=Path("bank_statement.pdf"), @@ -117,7 +119,9 @@ def test_excluded_files_multiple_exclusions(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=[], page_count=2, iban=None, + transactions=[], + page_count=2, + iban=None, source_file=Path("credit_card.pdf"), warnings=["credit card statement detected, skipped"], ) @@ -160,15 +164,17 @@ def 
test_excluded_files_mixed_scenarios(self): def mock_extract_side_effect(pdf_path, *args, **kwargs): if "credit_card" in str(pdf_path): return ExtractionResult( - transactions=[], page_count=1, iban=None, + transactions=[], + page_count=1, + iban=None, source_file=pdf_path, warnings=["credit card statement detected, skipped"], ) else: return ExtractionResult( - transactions=dicts_to_transactions([ - {"Date": "01/12/23", "Details": "Test", "Debit €": "100"} - ]), + transactions=dicts_to_transactions( + [{"Date": "01/12/23", "Details": "Test", "Debit €": "100"}] + ), page_count=1, iban="IE29AIBK93115212345678", source_file=pdf_path, @@ -208,7 +214,9 @@ def test_excluded_files_json_structure(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=[], page_count=3, iban=None, + transactions=[], + page_count=3, + iban=None, source_file=Path("test.pdf"), warnings=["credit card statement detected, skipped"], ) @@ -327,7 +335,9 @@ def test_pdf_with_iban_not_excluded(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=[], page_count=2, iban="IE29AIBK93115212345678", + transactions=[], + page_count=2, + iban="IE29AIBK93115212345678", source_file=Path("bank_statement.pdf"), ) @@ -361,13 +371,17 @@ def mock_extract_side_effect(pdf_path, *args, **kwargs): if "bank_with_iban" in str(pdf_path): # Has IBAN, empty rows (should NOT be excluded) return ExtractionResult( - transactions=[], page_count=2, iban="IE29AIBK93115212345678", + transactions=[], + page_count=2, + iban="IE29AIBK93115212345678", source_file=pdf_path, ) else: # No IBAN, empty rows (SHOULD be excluded) return ExtractionResult( - transactions=[], page_count=1, iban=None, + transactions=[], + page_count=1, + iban=None, source_file=pdf_path, warnings=["credit card statement detected, skipped"], ) 
diff --git a/packages/parser-core/tests/test_pdf_table_extractor.py b/packages/parser-core/tests/test_pdf_table_extractor.py index d515393..1d2bec0 100644 --- a/packages/parser-core/tests/test_pdf_table_extractor.py +++ b/packages/parser-core/tests/test_pdf_table_extractor.py @@ -84,7 +84,9 @@ def test_extract_tables_with_date_propagation(self, mock_pdfplumber): self.assertEqual(result.transactions[0].filename, "test.pdf") # Check second transaction (should inherit date from first) - self.assertEqual(result.transactions[1].date.strip(), "01 Jan 2024") # Inherited! + self.assertEqual( + result.transactions[1].date.strip(), "01 Jan 2024" + ) # Inherited! self.assertEqual(result.transactions[1].details.strip(), "Coffee Shop") self.assertEqual(result.transactions[1].debit.strip(), "5.50") @@ -416,7 +418,9 @@ def crop_side_effect(*args): # Should have fewer rows (administrative content filtered) self.assertEqual(len(dynamic_result.transactions), 1) - self.assertEqual(dynamic_result.transactions[0].details.strip(), "VDC-STORE") + self.assertEqual( + dynamic_result.transactions[0].details.strip(), "VDC-STORE" + ) run_test() @@ -638,9 +642,7 @@ def test_extract_tables_with_page_validation_enabled(self, mock_pdfplumber): test_pdf_path = Path("/tmp/test.pdf") # Test with page validation enabled - result = extract_tables_from_pdf( - test_pdf_path, enable_page_validation=True - ) + result = extract_tables_from_pdf(test_pdf_path, enable_page_validation=True) # Should have processed only the valid page self.assertEqual(result.page_count, 2) # Total pages processed @@ -657,7 +659,9 @@ def test_extract_tables_with_page_validation_enabled(self, mock_pdfplumber): ) # Should process both pages (more rows due to invalid page content) - self.assertGreaterEqual(len(result_no_val.transactions), len(result.transactions)) + self.assertGreaterEqual( + len(result_no_val.transactions), len(result.transactions) + ) def test_analyze_content_density_empty_word_groups(self): """Test 
analyze_content_density with empty word groups.""" diff --git a/packages/parser-core/tests/test_processor.py b/packages/parser-core/tests/test_processor.py index eb8d05e..1b43ca0 100644 --- a/packages/parser-core/tests/test_processor.py +++ b/packages/parser-core/tests/test_processor.py @@ -289,13 +289,21 @@ def test_processor_continues_after_pdf_error(self, mock_extract): # Mock: first PDF succeeds, second fails, third succeeds mock_extract.side_effect = [ ExtractionResult( - transactions=dicts_to_transactions([{"Date": "01 Jan 2024", "Details": "Transaction 1"}]), - page_count=1, iban=None, source_file=Path("good.pdf"), + transactions=dicts_to_transactions( + [{"Date": "01 Jan 2024", "Details": "Transaction 1"}] + ), + page_count=1, + iban=None, + source_file=Path("good.pdf"), ), OSError("Failed to open PDF"), ExtractionResult( - transactions=dicts_to_transactions([{"Date": "02 Jan 2024", "Details": "Transaction 2"}]), - page_count=1, iban=None, source_file=Path("good2.pdf"), + transactions=dicts_to_transactions( + [{"Date": "02 Jan 2024", "Details": "Transaction 2"}] + ), + page_count=1, + iban=None, + source_file=Path("good2.pdf"), ), ] @@ -319,12 +327,19 @@ def test_processor_handles_empty_pdf_gracefully(self, mock_extract): # Mock: first PDF returns no rows, second has data mock_extract.side_effect = [ ExtractionResult( - transactions=[], page_count=1, iban=None, source_file=Path("empty.pdf"), + transactions=[], + page_count=1, + iban=None, + source_file=Path("empty.pdf"), warnings=["credit card statement detected, skipped"], ), ExtractionResult( - transactions=dicts_to_transactions([{"Date": "01 Jan 2024", "Details": "Transaction 1"}]), - page_count=1, iban=None, source_file=Path("with_data.pdf"), + transactions=dicts_to_transactions( + [{"Date": "01 Jan 2024", "Details": "Transaction 1"}] + ), + page_count=1, + iban=None, + source_file=Path("with_data.pdf"), ), ] diff --git a/packages/parser-core/tests/test_processor_refactored_methods.py 
b/packages/parser-core/tests/test_processor_refactored_methods.py index 92ff97d..72d24b9 100644 --- a/packages/parser-core/tests/test_processor_refactored_methods.py +++ b/packages/parser-core/tests/test_processor_refactored_methods.py @@ -78,12 +78,20 @@ def test_process_all_pdfs_multiple_files(self, mock_extract): # Mock extract_tables_from_pdf to return different data mock_extract.side_effect = [ ExtractionResult( - transactions=dicts_to_transactions([{"Date": "01/01/23", "Details": "Test1"}]), - page_count=5, iban=None, source_file=Path("file1.pdf"), + transactions=dicts_to_transactions( + [{"Date": "01/01/23", "Details": "Test1"}] + ), + page_count=5, + iban=None, + source_file=Path("file1.pdf"), ), ExtractionResult( - transactions=dicts_to_transactions([{"Date": "02/01/23", "Details": "Test2"}]), - page_count=3, iban=None, source_file=Path("file2.pdf"), + transactions=dicts_to_transactions( + [{"Date": "02/01/23", "Details": "Test2"}] + ), + page_count=3, + iban=None, + source_file=Path("file2.pdf"), ), ] @@ -93,8 +101,12 @@ def test_process_all_pdfs_multiple_files(self, mock_extract): self.assertEqual(len(results), 2) self.assertEqual(results[0].page_count, 5) self.assertEqual(results[1].page_count, 3) - self.assertEqual(results[0].transactions[0].to_dict()["Details"], "Test1") - self.assertEqual(results[1].transactions[0].to_dict()["Details"], "Test2") + self.assertEqual( + results[0].transactions[0].to_dict()["Details"], "Test1" + ) + self.assertEqual( + results[1].transactions[0].to_dict()["Details"], "Test2" + ) self.assertIsNone(results[0].iban) self.assertIsNone(results[1].iban) @@ -332,10 +344,12 @@ def test_run_uses_refactored_methods(self, mock_extract): # Mock extract_tables_from_pdf mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([ - {"Date": "15/06/23", "Details": "Second", "Debit €": "200"}, - {"Date": "01/01/23", "Details": "First", "Debit €": "100"}, - ]), + transactions=dicts_to_transactions( + [ + {"Date": 
"15/06/23", "Details": "Second", "Debit €": "200"}, + {"Date": "01/01/23", "Details": "First", "Debit €": "100"}, + ] + ), page_count=2, iban=None, source_file=Path("test.pdf"), diff --git a/packages/parser-core/tests/test_transaction_data_no_blocking.py b/packages/parser-core/tests/test_transaction_data_no_blocking.py index ae2596f..5a062e0 100644 --- a/packages/parser-core/tests/test_transaction_data_no_blocking.py +++ b/packages/parser-core/tests/test_transaction_data_no_blocking.py @@ -201,13 +201,15 @@ def test_processor_does_not_use_transaction_data_for_exclusion(self): ) as mock_extract: # Returns: ExtractionResult with VISA in transactions, IBAN from header mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([ - { - "Date": "11 Aug", - "Details": "MOBI CLICK VISA", - "Debit €": "100.00", - } - ]), + transactions=dicts_to_transactions( + [ + { + "Date": "11 Aug", + "Details": "MOBI CLICK VISA", + "Debit €": "100.00", + } + ] + ), page_count=1, iban="IE48AIBK93408921459015", source_file=Path("test.pdf"), @@ -246,10 +248,12 @@ def test_all_card_brands_in_transactions_allowed(self): "bankstatements_core.services.extraction_orchestrator.extract_tables_from_pdf" ) as mock_extract: mock_extract.return_value = ExtractionResult( - transactions=dicts_to_transactions([ - {"Date": "11 Aug", "Details": detail} - for detail in transaction_details - ]), + transactions=dicts_to_transactions( + [ + {"Date": "11 Aug", "Details": detail} + for detail in transaction_details + ] + ), page_count=1, iban="IE48AIBK93408921459015", source_file=Path("test.pdf"), From d9ade9b8de68999659f21dbf65f22d8e681ff5b1 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:53:30 +0000 Subject: [PATCH 12/14] style: fix isort import ordering in parser-core --- .../src/bankstatements_core/extraction/extraction_facade.py | 2 +- .../src/bankstatements_core/extraction/pdf_extractor.py | 4 ++-- packages/parser-core/src/bankstatements_core/processor.py | 4 
++-- packages/parser-core/tests/test_pdf_table_extractor.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py b/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py index 7fdc3d6..7c4f9c0 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py +++ b/packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py @@ -11,8 +11,8 @@ from typing import TYPE_CHECKING from bankstatements_core.config.column_config import DEFAULT_COLUMNS -from bankstatements_core.extraction.extraction_params import TABLE_BOTTOM_Y, TABLE_TOP_Y from bankstatements_core.domain import ExtractionResult +from bankstatements_core.extraction.extraction_params import TABLE_BOTTOM_Y, TABLE_TOP_Y if TYPE_CHECKING: from bankstatements_core.extraction.row_classifiers import RowClassifier diff --git a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py index 549e6c2..b1648a2 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py @@ -13,6 +13,8 @@ if TYPE_CHECKING: from bankstatements_core.domain.protocols.pdf_reader import IPDFReader +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import dicts_to_transactions from bankstatements_core.extraction.iban_extractor import IBANExtractor from bankstatements_core.extraction.page_header_analyser import PageHeaderAnalyser from bankstatements_core.extraction.row_builder import RowBuilder @@ -22,8 +24,6 @@ StatefulPageRowProcessor, extract_filename_date, ) -from bankstatements_core.domain import ExtractionResult -from bankstatements_core.domain.converters import dicts_to_transactions logger = logging.getLogger(__name__) diff --git 
a/packages/parser-core/src/bankstatements_core/processor.py b/packages/parser-core/src/bankstatements_core/processor.py index 5da8dca..1d410d5 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -11,6 +11,8 @@ from bankstatements_core.config.column_config import DEFAULT_COLUMNS, get_column_names from bankstatements_core.config.processor_config import ProcessorConfig from bankstatements_core.config.totals_config import parse_totals_columns # noqa: F401 +from bankstatements_core.domain import ExtractionResult +from bankstatements_core.domain.converters import transactions_to_dicts from bankstatements_core.services.column_analysis import ColumnAnalysisService from bankstatements_core.services.date_parser import DateParserService from bankstatements_core.services.duplicate_detector import DuplicateDetectionService @@ -30,8 +32,6 @@ TransactionProcessingOrchestrator, ) from bankstatements_core.utils import is_date_column, to_float # noqa: F401 -from bankstatements_core.domain import ExtractionResult -from bankstatements_core.domain.converters import transactions_to_dicts logger = logging.getLogger(__name__) diff --git a/packages/parser-core/tests/test_pdf_table_extractor.py b/packages/parser-core/tests/test_pdf_table_extractor.py index 1d2bec0..500fa69 100644 --- a/packages/parser-core/tests/test_pdf_table_extractor.py +++ b/packages/parser-core/tests/test_pdf_table_extractor.py @@ -6,9 +6,9 @@ from pathlib import Path from unittest.mock import MagicMock, Mock, patch -from bankstatements_core.extraction.column_identifier import ColumnTypeIdentifier from bankstatements_core.domain import ExtractionResult from bankstatements_core.domain.converters import dicts_to_transactions +from bankstatements_core.extraction.column_identifier import ColumnTypeIdentifier from bankstatements_core.pdf_table_extractor import ( DEFAULT_COLUMNS, analyze_content_density, From 
4d609703faad7754a0cb35d01f9888c693b60eb8 Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:56:57 +0000 Subject: [PATCH 13/14] fix: remove unused imports and variables flagged by flake8 --- .../src/bankstatements_core/templates/template_detector.py | 2 +- packages/parser-core/tests/extraction/test_pdf_extractor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/templates/template_detector.py b/packages/parser-core/src/bankstatements_core/templates/template_detector.py index 9c481a7..7ca1ba2 100644 --- a/packages/parser-core/src/bankstatements_core/templates/template_detector.py +++ b/packages/parser-core/src/bankstatements_core/templates/template_detector.py @@ -5,7 +5,7 @@ import logging import re from collections import defaultdict -from dataclasses import dataclass, field +from dataclasses import dataclass from pathlib import Path from pdfplumber.page import Page diff --git a/packages/parser-core/tests/extraction/test_pdf_extractor.py b/packages/parser-core/tests/extraction/test_pdf_extractor.py index 5fa3e2b..a0424f7 100644 --- a/packages/parser-core/tests/extraction/test_pdf_extractor.py +++ b/packages/parser-core/tests/extraction/test_pdf_extractor.py @@ -538,7 +538,7 @@ def test_per_page_header_check_top_y_override(self, mock_pdfplumber): enable_header_check=True, # Enable header check ) - result = extractor.extract(Path("/tmp/test.pdf")) + extractor.extract(Path("/tmp/test.pdf")) # Verify header area crop was called with override value (450) header_crop_calls = [ From 60df9ad55e2280ecc5398e5e5306100d46a6770e Mon Sep 17 00:00:00 2001 From: longieirl Date: Wed, 25 Mar 2026 10:59:46 +0000 Subject: [PATCH 14/14] fix: rename loop var to avoid ExtractionResult/dict type collision in mypy --- .../parser-core/src/bankstatements_core/processor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/processor.py 
b/packages/parser-core/src/bankstatements_core/processor.py index 1d410d5..e049574 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -314,11 +314,11 @@ def run(self) -> dict: all_rows: list[dict] = [] pages_read = 0 pdf_ibans: dict[str, str] = {} - for result in extraction_results: - pages_read += result.page_count - if result.iban: - pdf_ibans[result.source_file.name] = result.iban - all_rows.extend(transactions_to_dicts(result.transactions)) + for extraction in extraction_results: + pages_read += extraction.page_count + if extraction.iban: + pdf_ibans[extraction.source_file.name] = extraction.iban + all_rows.extend(transactions_to_dicts(extraction.transactions)) pdf_count = len(sorted(self.input_dir.glob("*.pdf"))) # Step 2: Group transactions by IBAN (delegated to orchestrator)