diff --git a/packages/parser-core/src/bankstatements_core/domain/models/extraction_result.py b/packages/parser-core/src/bankstatements_core/domain/models/extraction_result.py index 087bbef..f0b22bc 100644 --- a/packages/parser-core/src/bankstatements_core/domain/models/extraction_result.py +++ b/packages/parser-core/src/bankstatements_core/domain/models/extraction_result.py @@ -24,6 +24,10 @@ class ExtractionResult: card_number: Card number extracted from credit card statement header, or None for bank statements. Set to "unknown" when CC PDF is detected on paid tier but no card number pattern matches. + statement_year: Year inferred from a document-level date field (e.g. + "Payment Due Date: 3 Mar 2026" → 2026). Used to resolve yearless + transaction dates (e.g. "3 Feb") at sort time. None when the year + could not be determined from the PDF. """ transactions: list[Transaction] @@ -32,3 +36,4 @@ class ExtractionResult: source_file: Path warnings: list[ExtractionWarning] = field(default_factory=list) card_number: str | None = None + statement_year: int | None = None diff --git a/packages/parser-core/src/bankstatements_core/extraction/page_header_analyser.py b/packages/parser-core/src/bankstatements_core/extraction/page_header_analyser.py index 30bbdbe..4ea6713 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/page_header_analyser.py +++ b/packages/parser-core/src/bankstatements_core/extraction/page_header_analyser.py @@ -25,6 +25,15 @@ _IBAN_HEADER_Y = 350 +# Patterns for extracting the statement year from a payment due date field. 
+# Matches lines like: +# "Payment Due 3 Mar 2026" +# "Payment Due Date: 20 Feb 2026" +_PAYMENT_DUE_PATTERNS = [ + r"Payment\s+Due\s+Date\s*[:\s]\s*\d{1,2}\s+\w+\s+(\d{4})", + r"Payment\s+Due\s+\d{1,2}\s+\w+\s+(\d{4})", +] + class PageHeaderAnalyser: """Inspects the page header area for credit card indicators and IBAN.""" @@ -57,6 +66,34 @@ def is_credit_card_statement(self, page: Any, table_top_y: int) -> bool: logger.warning("Error checking for credit card statement: %s", e) return False + def extract_statement_year(self, page: Any) -> int | None: + """Extract the statement year from a 'Payment Due' or 'Payment Due Date' field. + + Scans the full page 1 text for patterns like: + - "Payment Due 3 Mar 2026" + - "Payment Due Date: 20 Feb 2026" + + Args: + page: pdfplumber page object (page 1 only) + + Returns: + Four-digit year as int if found, None otherwise + """ + try: + page_text = page.extract_text() + if page_text: + for pattern in _PAYMENT_DUE_PATTERNS: + match = re.search(pattern, page_text, re.IGNORECASE) + if match: + year = int(match.group(1)) + logger.debug( + "Statement year %d extracted from 'Payment Due' field", year + ) + return year + except (AttributeError, ValueError, TypeError) as e: + logger.warning("Error extracting statement year from page: %s", e) + return None + def extract_iban(self, page: Any) -> str | None: """Extract account IBAN from the page header area (y < 350). 
diff --git a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py index 596a5b7..084d0fa 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py @@ -94,22 +94,15 @@ def extract(self, pdf_path: Path) -> ExtractionResult: rows: list[dict] = [] iban = None card_number: str | None = None - - filename_date = extract_filename_date(pdf_path.name) - page_processor = StatefulPageRowProcessor( - RowPostProcessor( - columns=self.columns, - row_classifier=self._row_classifier, - template=self.template, - filename_date=filename_date, - filename=pdf_path.name, - ) - ) + statement_year: int | None = None with self._pdf_reader.open(pdf_path) as pdf: - for page_num, page in enumerate(pdf.pages, 1): - if page_num == 1 and self._header_analyser.is_credit_card_statement( - page, self.table_top_y + # --- Page 1 pre-scan: gather document-level metadata before processing rows --- + if pdf.pages: + page1 = pdf.pages[0] + + if self._header_analyser.is_credit_card_statement( + page1, self.table_top_y ): if self._entitlements is None or self._entitlements.require_iban: logger.warning( @@ -129,27 +122,40 @@ def extract(self, pdf_path: Path) -> ExtractionResult: ], ) - if iban is None and page_num == 1: - iban = self._header_analyser.extract_iban(page) - if iban: - logger.info( - "IBAN found on page %s: %s****%s", - page_num, - iban[:4], - iban[-4:], + # Paid tier CC: extract card number and statement year up front + extracted = self._extract_card_number(page1) + card_number = extracted if extracted is not None else "unknown" + + statement_year = self._header_analyser.extract_statement_year(page1) + if statement_year is None: + logger.warning( + "Could not determine statement year from '%s'. 
" + "Yearless dates will not sort correctly.", + pdf_path.name, ) - # Extract card number on page 1 for CC statements (paid tier only) - if page_num == 1 and card_number is None: - extracted = self._extract_card_number(page) - if extracted is not None: - card_number = extracted - elif self._header_analyser.is_credit_card_statement( - page, self.table_top_y - ): - # CC PDF detected on paid tier but no card pattern matched - card_number = "unknown" + iban = self._header_analyser.extract_iban(page1) + if iban: + logger.info( + "IBAN found on page 1: %s****%s", + iban[:4], + iban[-4:], + ) + # Build page processor now that document-level metadata is known + filename_date = extract_filename_date(pdf_path.name) + page_processor = StatefulPageRowProcessor( + RowPostProcessor( + columns=self.columns, + row_classifier=self._row_classifier, + template=self.template, + filename_date=filename_date, + filename=pdf_path.name, + statement_year=statement_year, + ) + ) + + for page_num, page in enumerate(pdf.pages, 1): page_rows = self._extract_page(page, page_num) if page_rows is None: continue @@ -168,6 +174,7 @@ def extract(self, pdf_path: Path) -> ExtractionResult: iban=iban, source_file=pdf_path, card_number=card_number, + statement_year=statement_year, ) def _extract_card_number(self, page: Any) -> str | None: diff --git a/packages/parser-core/src/bankstatements_core/extraction/row_classifiers.py b/packages/parser-core/src/bankstatements_core/extraction/row_classifiers.py index 2c44df6..d027450 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/row_classifiers.py +++ b/packages/parser-core/src/bankstatements_core/extraction/row_classifiers.py @@ -218,6 +218,34 @@ def _do_classify( return None +class RefContinuationClassifier(RowClassifier): + """Classifies reference-number continuation lines (e.g. AIB CC 'Ref: 123456'). 
+ + AIB Credit Card PDFs split each transaction across two physical lines: + - Line 1: Transaction Date | Posting Date | Details | Amount + - Line 2: Reference number only (e.g. "Ref: 123456") with the same date + repeated in the Transaction Date column + + Without this classifier, Line 2 is misclassified as a 'transaction' by + TransactionClassifier (it has a date) and emitted as an empty phantom row. + This classifier catches it before TransactionClassifier runs and marks it + as 'continuation' so RowMergerService merges it into the parent transaction. + """ + + _REF_PATTERN = re.compile(r"^Ref\s*:\s*\d+", re.IGNORECASE) + + def _do_classify( + self, row: dict, columns: dict[str, tuple[int | float, int | float]] + ) -> str | None: + """Detect reference-number continuation lines.""" + description_text = self._get_description_text(row, columns) + + if self._REF_PATTERN.match(description_text): + return "continuation" + + return None + + class FXContinuationClassifier(RowClassifier): """Classifies foreign exchange continuation lines.""" @@ -401,10 +429,11 @@ def create_row_classifier_chain() -> RowClassifier: 0 HeaderMetadataClassifier — column headers and field labels 1 AdministrativeClassifier — BALANCE FORWARD, Lending @ 2 ReferenceCodeClassifier — IE123456 patterns - 3 FXContinuationClassifier — FX rates, fees, exchange lines - 4 TimestampMetadataClassifier — 01JAN2023 TIME 14:30 - 5 TransactionClassifier — debit/credit/date combinations - 6 DefaultMetadataClassifier — catch-all fallback + 3 RefContinuationClassifier — Ref: 123456 continuation lines (AIB CC) + 4 FXContinuationClassifier — FX rates, fees, exchange lines + 5 TimestampMetadataClassifier — 01JAN2023 TIME 14:30 + 6 TransactionClassifier — debit/credit/date combinations + 7 DefaultMetadataClassifier — catch-all fallback Returns: The head of the classifier chain @@ -414,9 +443,10 @@ def create_row_classifier_chain() -> RowClassifier: (0, HeaderMetadataClassifier), (1, AdministrativeClassifier), (2, 
ReferenceCodeClassifier), - (3, FXContinuationClassifier), - (4, TimestampMetadataClassifier), - (5, TransactionClassifier), - (6, DefaultMetadataClassifier), + (3, RefContinuationClassifier), + (4, FXContinuationClassifier), + (5, TimestampMetadataClassifier), + (6, TransactionClassifier), + (7, DefaultMetadataClassifier), ] ).build_chain() diff --git a/packages/parser-core/src/bankstatements_core/extraction/row_post_processor.py b/packages/parser-core/src/bankstatements_core/extraction/row_post_processor.py index ebb18b1..6a5c6a2 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/row_post_processor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/row_post_processor.py @@ -58,6 +58,7 @@ def __init__( # noqa: PLR0913 filename_date: str, filename: str, scoring_config: ExtractionScoringConfig | None = None, + statement_year: int | None = None, ) -> None: self._columns = columns self._row_classifier = row_classifier @@ -69,6 +70,7 @@ def __init__( # noqa: PLR0913 if scoring_config is not None else ExtractionScoringConfig.default() ) + self._statement_year = statement_year self._date_col = ColumnTypeIdentifier.find_first_column_of_type(columns, "date") self._balance_col = ColumnTypeIdentifier.find_first_column_of_type( columns, "balance" @@ -147,6 +149,8 @@ def process(self, row: dict, current_date: str) -> str: # Metadata tagging row["Filename"] = self._filename + if self._statement_year is not None: + row["statement_year"] = str(self._statement_year) if self._template: row["document_type"] = self._template.document_type row["template_id"] = self._template.id diff --git a/packages/parser-core/src/bankstatements_core/services/date_parser.py b/packages/parser-core/src/bankstatements_core/services/date_parser.py index cee53a5..33ae383 100644 --- a/packages/parser-core/src/bankstatements_core/services/date_parser.py +++ b/packages/parser-core/src/bankstatements_core/services/date_parser.py @@ -43,7 +43,15 @@ class DateParserService: 
"%d%B%Y", # 01December2023 ] - def parse_transaction_date(self, date_str: str) -> datetime: + # Yearless date formats used by CC statements (e.g. AIB CC: "3 Feb") + YEARLESS_DATE_FORMATS = [ # noqa: RUF012 + "%d %b", # 3 Feb + "%d %B", # 3 February + ] + + def parse_transaction_date( + self, date_str: str, hint_year: int | None = None + ) -> datetime: """ Parse bank statement date string into datetime object. @@ -59,9 +67,13 @@ def parse_transaction_date(self, date_str: str) -> datetime: - DDMMMYY (e.g., "01DEC23") - DDMMMYYYY (e.g., "01DEC2023") - Partial dates: "DD/MM" (missing year) + - Yearless word dates: "3 Feb", "3 February" (requires hint_year) Args: date_str: Date string from bank statement + hint_year: Optional year inferred from a document-level field (e.g. + "Payment Due Date"). Used to resolve yearless dates like "3 Feb". + When None and the date is yearless, epoch is returned. Returns: datetime object, or epoch (1970-01-01) if unparseable @@ -78,6 +90,8 @@ def parse_transaction_date(self, date_str: str) -> datetime: datetime.datetime(2025, 4, 25, 0, 0) >>> service.parse_transaction_date("") datetime.datetime(1970, 1, 1, 0, 0) + >>> service.parse_transaction_date("3 Feb", hint_year=2026) + datetime.datetime(2026, 2, 3, 0, 0) """ # Handle empty or whitespace-only strings if not date_str or not date_str.strip(): @@ -89,11 +103,16 @@ def parse_transaction_date(self, date_str: str) -> datetime: # "Sept" -> "Sep" (Python's datetime uses 3-letter abbreviations) date_str = date_str.replace("Sept", "Sep") - # Try common date formats + # Try common date formats (all include a year component) parsed_date = self._parse_common_date_formats(date_str) if parsed_date is not None: return parsed_date + # Try yearless formats (e.g. 
"3 Feb" from CC statements)
+        parsed_date = self._parse_yearless_date(date_str, hint_year)
+        if parsed_date is not None:
+            return parsed_date
+
         # Try partial date parsing (DD/MM without year)
         parsed_date = self._parse_partial_date(date_str)
         if parsed_date is not None:
@@ -200,6 +219,46 @@ def _parse_common_date_formats(self, date_str: str) -> datetime | None:
 
         return None
 
+    def _parse_yearless_date(
+        self, date_str: str, hint_year: int | None
+    ) -> datetime | None:
+        """
+        Parse yearless date strings like "3 Feb" or "3 February".
+
+        Uses hint_year (from a document-level field such as "Payment Due Date")
+        to supply the missing year. Returns None when no hint_year is available
+        so the caller can fall through to epoch and log a warning.
+
+        Args:
+            date_str: Date string without a year component
+            hint_year: Year to substitute, or None if unknown
+
+        Returns:
+            Parsed datetime if format matches and hint_year is provided, None otherwise
+
+        Examples:
+            >>> service = DateParserService()
+            >>> service._parse_yearless_date("3 Feb", 2026)
+            datetime.datetime(2026, 2, 3, 0, 0)
+            >>> service._parse_yearless_date("3 February", 2026)
+            datetime.datetime(2026, 2, 3, 0, 0)
+            >>> service._parse_yearless_date("3 Feb", None) is None
+            True
+        """
+        if hint_year is None:
+            return None
+
+        for fmt in self.YEARLESS_DATE_FORMATS:
+            # Append a fixed year so strptime never parses a year-free date
+            # (avoids Python 3.15 deprecation of yearless strptime).
+            # NOTE(review): 1900 is not a leap year, so "29 Feb" fails to parse
+            # here even when hint_year is a leap year — confirm whether 29 Feb
+            # can occur; a leap anchor year (e.g. 2000) would accept it.
+            augmented = f"{date_str} 1900"
+            augmented_fmt = f"{fmt} %Y"
+            parsed = self._try_parse_date_format(augmented, augmented_fmt)
+            if parsed is not None:
+                # NOTE(review): .replace(year=...) raises ValueError for
+                # 29 Feb -> non-leap hint_year; nothing here catches it — confirm.
+                return parsed.replace(year=hint_year)
+
+        return None
+
     def _parse_partial_date(self, date_str: str) -> datetime | None:
         """
         Parse partial date strings like "DD/MM" or "DD-MM" (missing year).
diff --git a/packages/parser-core/src/bankstatements_core/services/row_merger.py b/packages/parser-core/src/bankstatements_core/services/row_merger.py index 96e48f0..049e8c1 100644 --- a/packages/parser-core/src/bankstatements_core/services/row_merger.py +++ b/packages/parser-core/src/bankstatements_core/services/row_merger.py @@ -31,10 +31,7 @@ def __init__(self) -> None: """Initialize the row merger service.""" self._last_transaction_row: dict | None = None - def merge_continuation_lines( # noqa: C901, PLR0912, PLR0915 - # pylint: disable=too-many-branches - # Row merger heuristic — 17 branches reflect the full set of continuation - # line detection rules. Complexity is inherent to the domain logic. + def merge_continuation_lines( self, rows: list[dict], columns: dict[str, tuple[int | float, int | float]], @@ -61,15 +58,13 @@ def merge_continuation_lines( # noqa: C901, PLR0912, PLR0915 if not rows: return rows - # Reset state for this batch self._last_transaction_row = None - # Find description and date columns description_col = find_first_column_of_type(columns, "description") date_col = find_first_column_of_type(columns, "date") if not description_col: - return rows # Can't merge without description column + return rows merged_rows = [] i = 0 @@ -79,78 +74,151 @@ def merge_continuation_lines( # noqa: C901, PLR0912, PLR0915 row_type = self._classify_row_type(current_row, columns) if row_type == "transaction": - # Look ahead for continuation lines - continuation_parts = [] - j = i + 1 - - while j < len(rows): - next_row = rows[j] - next_type = self._classify_row_type(next_row, columns) - - if next_type == "continuation": - # Extract the continuation text - continuation_text = next_row.get(description_col, "").strip() - if continuation_text: - continuation_parts.append(continuation_text) - - # If this continuation line has a balance, preserve it - current_row = self._preserve_balance_from_continuation( - current_row, next_row, columns - ) - - j += 1 - elif 
next_type == "transaction": - # Found next transaction, stop looking for continuations - break - else: - # Other row types (administrative, etc.) - stop looking - break - - # Merge continuation parts into the main transaction description - if continuation_parts: - original_desc = current_row.get(description_col, "").strip() - merged_desc = original_desc + " " + " ".join(continuation_parts) - current_row[description_col] = merged_desc.strip() - - # Store current row as last transaction for date carry-forward + if self._is_date_only_split(current_row, rows, i, date_col, columns): + next_row = rows[i + 1].copy() + next_row[date_col] = current_row[date_col] + rows[i + 1] = next_row + logger.debug( + "Date-only split row: carried date '%s' into next row", + current_row[date_col], + ) + i += 1 + continue + + current_row, j = self._collect_continuations( + current_row, rows, i, description_col, columns + ) self._last_transaction_row = current_row.copy() - merged_rows.append(current_row) - i = j # Skip to after the last continuation line + i = j elif row_type == "continuation": - # Continuation line without preceding transaction - # Check if it's missing a date (date grouping pattern) - if date_col and self._last_transaction_row: - current_date = current_row.get(date_col, "").strip() - if not current_date: - # Carry forward date from last transaction - last_date = self._last_transaction_row.get(date_col, "").strip() - if last_date: - current_row[date_col] = last_date - logger.debug( - "Carried forward date '%s' to continuation row", - last_date, - ) - # Reclassify - it might be a transaction now - row_type = self._classify_row_type(current_row, columns) - + current_row, row_type = self._handle_orphaned_continuation( + current_row, row_type, date_col, columns + ) if row_type == "transaction": - # After date carry-forward, it's now a transaction self._last_transaction_row = current_row.copy() merged_rows.append(current_row) else: - # Still a continuation - skip orphaned line 
logger.warning("Orphaned continuation line: %s", current_row) i += 1 else: - # Non-transaction, non-continuation row - keep as is merged_rows.append(current_row) i += 1 return merged_rows + def _is_date_only_split( + self, + current_row: dict, + rows: list[dict], + i: int, + date_col: str | None, + columns: dict[str, tuple[int | float, int | float]], + ) -> bool: + """Return True when this row is a date-only PDF split that should be carried forward. + + Detects AIB CC Y-split rows where the transaction date lands at a slightly + different Y-coordinate, causing RowBuilder to emit a standalone date-only row. + """ + desc_col = find_first_column_of_type(columns, "description") + return bool( + date_col + and desc_col + and current_row.get(date_col, "").strip() + and not current_row.get(desc_col, "").strip() + and self._is_date_only_row(current_row, columns) + and i + 1 < len(rows) + and self._classify_row_type(rows[i + 1], columns) == "transaction" + and not rows[i + 1].get(date_col, "").strip() + ) + + def _collect_continuations( + self, + current_row: dict, + rows: list[dict], + i: int, + description_col: str, + columns: dict[str, tuple[int | float, int | float]], + ) -> tuple[dict, int]: + """Scan ahead and merge any continuation lines into current_row. + + Returns the updated row and the index of the next unprocessed row. 
+ """ + continuation_parts: list[str] = [] + j = i + 1 + + while j < len(rows): + next_row = rows[j] + next_type = self._classify_row_type(next_row, columns) + + if next_type == "continuation": + text = next_row.get(description_col, "").strip() + if text: + continuation_parts.append(text) + current_row = self._preserve_balance_from_continuation( + current_row, next_row, columns + ) + j += 1 + else: + break + + if continuation_parts: + original_desc = current_row.get(description_col, "").strip() + current_row[description_col] = ( + original_desc + " " + " ".join(continuation_parts) + ).strip() + + return current_row, j + + def _handle_orphaned_continuation( + self, + current_row: dict, + row_type: str, + date_col: str | None, + columns: dict[str, tuple[int | float, int | float]], + ) -> tuple[dict, str]: + """Attempt to promote an orphaned continuation by carrying forward the last date.""" + if ( + date_col + and self._last_transaction_row + and not current_row.get(date_col, "").strip() + ): + last_date = self._last_transaction_row.get(date_col, "").strip() + if last_date: + current_row[date_col] = last_date + logger.debug("Carried forward date '%s' to continuation row", last_date) + row_type = self._classify_row_type(current_row, columns) + return current_row, row_type + + def _is_date_only_row( + self, + row: dict, + columns: dict[str, tuple[int | float, int | float]], + ) -> bool: + """Return True if the row contains only date-column values and nothing else. + + Used to detect AIB CC Y-split rows where the Transaction Date lands at a + slightly different Y-coordinate than the Posting Date / Details / Amount, + causing RowBuilder to emit a standalone date-only row. 
+ + Args: + row: Row dictionary to inspect + columns: Column definitions + + Returns: + True if all non-empty, non-Filename values belong to date-type columns + """ + from bankstatements_core.domain.column_types import ( # noqa: PLC0415 + get_type_as_string, + ) + + non_empty = {k: v for k, v in row.items() if v.strip() and k != "Filename"} + if not non_empty: + return False + return all(get_type_as_string(k) == "date" for k in non_empty) + def _classify_row_type( self, row: dict, columns: dict[str, tuple[int | float, int | float]] ) -> str: diff --git a/packages/parser-core/src/bankstatements_core/services/sorting_service.py b/packages/parser-core/src/bankstatements_core/services/sorting_service.py index 9e5d0aa..f0cf530 100644 --- a/packages/parser-core/src/bankstatements_core/services/sorting_service.py +++ b/packages/parser-core/src/bankstatements_core/services/sorting_service.py @@ -6,8 +6,10 @@ from __future__ import annotations +import contextlib import logging from abc import ABC, abstractmethod +from datetime import datetime from typing import TYPE_CHECKING from bankstatements_core.services.date_parser import DateParserService @@ -55,10 +57,17 @@ def sort(self, transactions: list[Transaction]) -> list[Transaction]: logger.debug("Sorting %d transactions chronologically", len(transactions)) - return sorted( - transactions, - key=lambda tx: _date_parser_service.parse_transaction_date(tx.date), - ) + def _sort_key(tx: Transaction) -> datetime: + hint_year: int | None = None + raw = tx.additional_fields.get("statement_year") + if raw is not None: + with contextlib.suppress(ValueError): + hint_year = int(raw) + return _date_parser_service.parse_transaction_date( + tx.date, hint_year=hint_year + ) + + return sorted(transactions, key=_sort_key) class NoSortingStrategy(SortingStrategy): diff --git a/packages/parser-core/tests/extraction/test_page_header_analyser.py b/packages/parser-core/tests/extraction/test_page_header_analyser.py index c7a1f12..c9f2a15 100644 
--- a/packages/parser-core/tests/extraction/test_page_header_analyser.py +++ b/packages/parser-core/tests/extraction/test_page_header_analyser.py @@ -114,3 +114,53 @@ def test_crops_at_fixed_350(self): analyser = PageHeaderAnalyser(mock_extractor) analyser.extract_iban(page) page.crop.assert_called_once_with((0, 0, page.width, 350)) + + +class TestExtractStatementYear: + """Tests for PageHeaderAnalyser.extract_statement_year.""" + + def _analyser(self) -> PageHeaderAnalyser: + return PageHeaderAnalyser(Mock()) + + def _make_full_page(self, full_text: str) -> Mock: + """Build a mock page where extract_text() returns full_text (no crop needed).""" + page = Mock() + page.extract_text.return_value = full_text + return page + + def test_payment_due_date_colon(self): + page = self._make_full_page("Payment Due Date: 20 Feb 2026\nSome other text") + assert self._analyser().extract_statement_year(page) == 2026 + + def test_payment_due_no_colon(self): + page = self._make_full_page("Payment Due 3 Mar 2026\nBalance: €0.00") + assert self._analyser().extract_statement_year(page) == 2026 + + def test_payment_due_date_different_year(self): + page = self._make_full_page("Payment Due Date: 1 Jan 2025") + assert self._analyser().extract_statement_year(page) == 2025 + + def test_case_insensitive(self): + page = self._make_full_page("PAYMENT DUE DATE: 15 Apr 2026") + assert self._analyser().extract_statement_year(page) == 2026 + + def test_returns_none_when_no_payment_due(self): + page = self._make_full_page("Statement Date: 01 Feb 2026\nBalance: €100.00") + assert self._analyser().extract_statement_year(page) is None + + def test_returns_none_for_empty_text(self): + page = self._make_full_page("") + assert self._analyser().extract_statement_year(page) is None + + def test_returns_none_for_none_text(self): + page = self._make_full_page(None) + assert self._analyser().extract_statement_year(page) is None + + def test_returns_none_on_page_exception(self): + page = Mock() + 
page.extract_text.side_effect = AttributeError("no text") + assert self._analyser().extract_statement_year(page) is None + + def test_payment_due_date_with_extra_whitespace(self): + page = self._make_full_page("Payment Due Date: 18 Feb 2026") + assert self._analyser().extract_statement_year(page) == 2026 diff --git a/packages/parser-core/tests/extraction/test_row_classifiers.py b/packages/parser-core/tests/extraction/test_row_classifiers.py index b842bd2..c42ea7d 100644 --- a/packages/parser-core/tests/extraction/test_row_classifiers.py +++ b/packages/parser-core/tests/extraction/test_row_classifiers.py @@ -10,6 +10,7 @@ DefaultMetadataClassifier, FXContinuationClassifier, HeaderMetadataClassifier, + RefContinuationClassifier, ReferenceCodeClassifier, RowClassifier, TimestampMetadataClassifier, @@ -258,6 +259,73 @@ def test_non_transaction(self): assert result is None +class TestRefContinuationClassifier: + """Tests for RefContinuationClassifier (AIB CC reference continuation lines).""" + + @pytest.fixture + def classifier(self): + return RefContinuationClassifier() + + @pytest.fixture + def cc_columns(self): + """CC-style columns (no amount col named 'Debit €').""" + return { + "Transaction Date": (29, 80), + "Posting Date": (80, 118), + "Transaction Details": (118, 370), + "Amount": (370, 430), + } + + def test_ref_colon_digits_classified_as_continuation(self, classifier, cc_columns): + """Ref: with no amount is continuation.""" + row = { + "Transaction Date": "4 Feb", + "Posting Date": "", + "Transaction Details": "Ref: 1234567890", + "Amount": "", + } + result = classifier._do_classify(row, cc_columns) + assert result == "continuation" + + def test_ref_no_space_classified_as_continuation(self, classifier, cc_columns): + """Ref: (no space after colon) is also continuation.""" + row = { + "Transaction Date": "4 Feb", + "Transaction Details": "Ref:9876543", + "Amount": "", + } + result = classifier._do_classify(row, cc_columns) + assert result == "continuation" + 
+ def test_ref_with_amount_still_continuation(self, classifier, cc_columns): + """Ref: pattern always classified as continuation regardless of amount. + + The 'Amount' column on CC templates doesn't map to a typed debit/credit column, + so we match purely on the description pattern — Ref: is always a + reference continuation line in practice. + """ + row = { + "Transaction Date": "4 Feb", + "Transaction Details": "Ref: 1234567890", + "Amount": "50.00", + } + result = classifier._do_classify(row, cc_columns) + assert result == "continuation" + + def test_regular_transaction_not_matched(self, classifier): + """Normal transaction description not affected.""" + row = {"Date": "4 Feb", "Details": "PAYPAL *CLEVERBRIDG", "Debit €": "84.54"} + result = classifier._do_classify(row, TEST_COLUMNS) + assert result is None + + def test_reference_word_in_middle_not_matched(self, classifier): + """'Ref:' in the middle of a description is not matched (anchored to start).""" + row = {"Date": "", "Details": "Payment Ref: 123 extra", "Debit €": ""} + # Pattern requires digits immediately after Ref: with no other text + result = classifier._do_classify(row, TEST_COLUMNS) + assert result is None + + class TestDefaultMetadataClassifier: """Tests for DefaultMetadataClassifier.""" @@ -293,6 +361,20 @@ def test_chain_classifies_reference(self): result = chain.classify(row, TEST_COLUMNS) assert result == "reference" + def test_chain_classifies_ref_continuation(self): + """Test chain classifies AIB CC Ref: line as continuation (not transaction).""" + chain = create_row_classifier_chain() + # AIB CC: date repeats on the Ref line — without this classifier, + # TransactionClassifier would see the date and emit a phantom empty row. 
+ row = { + "Date": "4 Feb", + "Details": "Ref: 1234567890", + "Debit €": "", + "Filename": "test", + } + result = chain.classify(row, TEST_COLUMNS) + assert result == "continuation" + def test_chain_classifies_fx_continuation(self): """Test chain correctly classifies FX continuation.""" chain = create_row_classifier_chain() diff --git a/packages/parser-core/tests/extraction/test_row_post_processor.py b/packages/parser-core/tests/extraction/test_row_post_processor.py index 5544d04..75a89a0 100644 --- a/packages/parser-core/tests/extraction/test_row_post_processor.py +++ b/packages/parser-core/tests/extraction/test_row_post_processor.py @@ -377,3 +377,56 @@ def test_bank_statement_document_type_unchanged(self): } proc.process(row, "") assert row["document_type"] == "bank_statement" + + def test_statement_year_stamped_on_transaction_row(self): + """statement_year is stamped as a string on each transaction row when provided.""" + proc = RowPostProcessor( + columns=TEST_COLUMNS, + row_classifier=_make_classifier("transaction"), + template=None, + filename_date="", + filename="statement.pdf", + statement_year=2026, + ) + row = { + "Date": "3 Feb", + "Details": "Purchase", + "Debit €": "", + "Credit €": "", + "Balance €": "", + } + proc.process(row, "") + assert row["statement_year"] == "2026" + + def test_statement_year_not_stamped_when_none(self): + """statement_year key absent from row when not provided.""" + proc = _make_processor() + row = { + "Date": "3 Feb", + "Details": "Purchase", + "Debit €": "", + "Credit €": "", + "Balance €": "", + } + proc.process(row, "") + assert "statement_year" not in row + + def test_statement_year_only_on_transaction_rows(self): + """statement_year is not stamped on non-transaction rows.""" + proc = RowPostProcessor( + columns=TEST_COLUMNS, + row_classifier=_make_classifier("header"), + template=None, + filename_date="", + filename="statement.pdf", + statement_year=2026, + ) + row = { + "Date": "", + "Details": "Date Details Debit 
Credit Balance", + "Debit €": "", + "Credit €": "", + "Balance €": "", + } + proc.process(row, "") + assert "statement_year" not in row diff --git a/packages/parser-core/tests/services/test_date_parser.py b/packages/parser-core/tests/services/test_date_parser.py new file mode 100644 index 0000000..8792a68 --- /dev/null +++ b/packages/parser-core/tests/services/test_date_parser.py @@ -0,0 +1,126 @@ +"""Tests for DateParserService — yearless date parsing and hint_year support.""" + +from __future__ import annotations + +from datetime import datetime + +import pytest + +from bankstatements_core.services.date_parser import DateParserService + + +@pytest.fixture() +def service() -> DateParserService: + return DateParserService() + + +class TestParseYearlessDate: + """Tests for _parse_yearless_date with hint_year.""" + + def test_abbreviated_month_with_hint_year(self, service): + result = service._parse_yearless_date("3 Feb", 2026) + assert result == datetime(2026, 2, 3) + + def test_full_month_name_with_hint_year(self, service): + result = service._parse_yearless_date("3 February", 2026) + assert result == datetime(2026, 2, 3) + + def test_single_digit_day(self, service): + result = service._parse_yearless_date("5 Jan", 2025) + assert result == datetime(2025, 1, 5) + + def test_two_digit_day(self, service): + result = service._parse_yearless_date("18 Mar", 2026) + assert result == datetime(2026, 3, 18) + + def test_returns_none_when_no_hint_year(self, service): + assert service._parse_yearless_date("3 Feb", None) is None + + def test_returns_none_for_non_yearless_format(self, service): + assert service._parse_yearless_date("01/02/2026", 2026) is None + + def test_hint_year_overrides_default(self, service): + result = service._parse_yearless_date("1 Dec", 2024) + assert result is not None + assert result.year == 2024 + + def test_all_months_abbreviated(self, service): + months = [ + ("Jan", 1), + ("Feb", 2), + ("Mar", 3), + ("Apr", 4), + ("May", 5), + ("Jun", 6), + 
("Jul", 7), + ("Aug", 8), + ("Sep", 9), + ("Oct", 10), + ("Nov", 11), + ("Dec", 12), + ] + for abbr, month_num in months: + result = service._parse_yearless_date(f"1 {abbr}", 2026) + assert result is not None, f"Failed to parse '1 {abbr}'" + assert result.month == month_num + + +class TestParseTransactionDateWithHintYear: + """Tests for parse_transaction_date with hint_year for yearless dates.""" + + def test_yearless_date_resolved_with_hint(self, service): + result = service.parse_transaction_date("3 Feb", hint_year=2026) + assert result == datetime(2026, 2, 3) + + def test_yearless_date_returns_epoch_without_hint(self, service): + result = service.parse_transaction_date("3 Feb") + assert result == service.EPOCH_DATE + + def test_full_month_name_yearless_with_hint(self, service): + result = service.parse_transaction_date("18 February", hint_year=2026) + assert result == datetime(2026, 2, 18) + + def test_dated_format_ignores_hint_year(self, service): + # Dates with year component should not be affected by hint_year + result = service.parse_transaction_date("01/02/2023", hint_year=2026) + assert result == datetime(2023, 2, 1) + + def test_dd_mmm_yyyy_ignores_hint_year(self, service): + result = service.parse_transaction_date("25 Apr 2025", hint_year=2026) + assert result == datetime(2025, 4, 25) + + def test_empty_string_returns_epoch(self, service): + assert service.parse_transaction_date("", hint_year=2026) == service.EPOCH_DATE + + def test_unparseable_string_returns_epoch(self, service): + assert ( + service.parse_transaction_date("not-a-date", hint_year=2026) + == service.EPOCH_DATE + ) + + def test_yearless_date_logs_no_warning_when_hint_provided(self, service, caplog): + import logging + + with caplog.at_level(logging.WARNING): + service.parse_transaction_date("3 Feb", hint_year=2026) + assert "Unable to parse date" not in caplog.text + + def test_yearless_date_logs_warning_without_hint(self, service, caplog): + import logging + + with 
caplog.at_level(logging.WARNING): + service.parse_transaction_date("3 Feb") + assert "Unable to parse date '3 Feb'" in caplog.text + + +class TestYearlessDateFormats: + """Verify YEARLESS_DATE_FORMATS constant is correctly defined.""" + + def test_yearless_formats_defined(self, service): + assert "%d %b" in service.YEARLESS_DATE_FORMATS + assert "%d %B" in service.YEARLESS_DATE_FORMATS + + def test_yearless_formats_not_in_main_formats(self, service): + # Yearless formats must NOT appear in DATE_FORMATS to avoid ambiguity + assert "%d %b" not in service.DATE_FORMATS + assert "%d %B" not in service.DATE_FORMATS diff --git a/packages/parser-core/tests/services/test_row_merger_integration.py b/packages/parser-core/tests/services/test_row_merger_integration.py index 328256e..80e1d50 100644 --- a/packages/parser-core/tests/services/test_row_merger_integration.py +++ b/packages/parser-core/tests/services/test_row_merger_integration.py @@ -356,3 +356,90 @@ def test_metadata_rows_kept_separate(self, service, columns): assert len(result) == 2 assert result[0]["Balance €"] == "500.00" assert result[1]["Balance €"] == "400.00" + + def test_aib_cc_ref_continuation_line_merged(self, service): + """AIB CC: Ref: continuation line merges into preceding transaction. + + The AIB Credit Card PDF splits each transaction across two physical lines: + - Line 1: Transaction Date | Posting Date | Transaction Details | Amount + - Line 2: same Transaction Date | empty | Ref: | empty + + Without the RefContinuationClassifier, Line 2 is classified as 'transaction' + (it has a date) and emitted as a phantom empty row. With the fix, it is + classified as 'continuation' and merged into Line 1's description. 
+ """ + cc_columns = { + "Transaction Date": (29, 80), + "Posting Date": (80, 118), + "Transaction Details": (118, 370), + "Amount": (370, 430), + } + rows = [ + { + "Transaction Date": "4 Feb", + "Posting Date": "5 Feb", + "Transaction Details": "PAYPAL *CLEVERBRIDG 35314369001 DE", + "Amount": "84.54", + }, + { + # AIB CC repeats the transaction date on the Ref line + "Transaction Date": "4 Feb", + "Posting Date": "", + "Transaction Details": "Ref: 9876543210", + "Amount": "", + }, + ] + + result = service.merge_continuation_lines(rows, cc_columns) + + # Should collapse to 1 row (the Ref line is merged, not emitted separately) + assert len(result) == 1 + assert result[0]["Transaction Date"] == "4 Feb" + assert result[0]["Amount"] == "84.54" + # Ref text is appended to the description + assert "Ref: 9876543210" in result[0]["Transaction Details"] + + def test_aib_cc_date_only_split_row_merged(self, service): + """AIB CC: date-only row caused by Y-split merges into the next transaction. + + Some AIB CC transactions have the Transaction Date at a slightly different + Y-coordinate than the Posting Date / Details / Amount, causing RowBuilder + to emit a standalone date-only row followed by a dateless transaction row. + The merger should carry the date forward and collapse them into one row. 
+ """ + cc_columns = { + "Transaction Date": (29, 80), + "Posting Date": (80, 118), + "Transaction Details": (118, 370), + "Amount": (370, 430), + } + rows = [ + # Date-only split row (Transaction Date word at different Y) + { + "Transaction Date": "4 Feb", + "Posting Date": "", + "Transaction Details": "", + "Amount": "", + }, + # Main transaction row (no Transaction Date, rest of fields present) + { + "Transaction Date": "", + "Posting Date": "5 Feb", + "Transaction Details": "PAYPAL *STRAVA INC 4029357733 US", + "Amount": "59.99", + }, + # Ref continuation + { + "Transaction Date": "", + "Posting Date": "", + "Transaction Details": "Ref: 24036036035604120333083", + "Amount": "", + }, + ] + + result = service.merge_continuation_lines(rows, cc_columns) + + assert len(result) == 1 + assert result[0]["Transaction Date"] == "4 Feb" + assert result[0]["Amount"] == "59.99" + assert "PAYPAL *STRAVA INC" in result[0]["Transaction Details"] diff --git a/packages/parser-core/tests/services/test_sorting_service.py b/packages/parser-core/tests/services/test_sorting_service.py index e312fee..13d49b1 100644 --- a/packages/parser-core/tests/services/test_sorting_service.py +++ b/packages/parser-core/tests/services/test_sorting_service.py @@ -14,6 +14,13 @@ def _tx(date: str, details: str) -> Transaction: return Transaction.from_dict({"Date": date, "Details": details}) +def _tx_with_year(date: str, details: str, statement_year: int) -> Transaction: + """Create a transaction with a statement_year in additional_fields (as stamped by RowPostProcessor).""" + tx = Transaction.from_dict({"Date": date, "Details": details}) + tx.additional_fields["statement_year"] = str(statement_year) + return tx + + class TestChronologicalSortingStrategy: """Tests for ChronologicalSortingStrategy.""" @@ -207,3 +214,50 @@ def test_sort_large_dataset(self): # Verify first few are early dates assert "01 Jan" in sorted_txns[0].date or "02 Jan" in sorted_txns[0].date + + +class 
TestChronologicalSortingWithYearlessDates: + """Tests for yearless date sorting using statement_year from additional_fields.""" + + def test_yearless_dates_sorted_when_year_present(self): + strategy = ChronologicalSortingStrategy() + transactions = [ + _tx_with_year("18 Feb", "Later", 2026), + _tx_with_year("3 Feb", "Earlier", 2026), + _tx_with_year("25 Feb", "Latest", 2026), + ] + sorted_txns = strategy.sort(transactions) + assert sorted_txns[0].details == "Earlier" # 3 Feb + assert sorted_txns[1].details == "Later" # 18 Feb + assert sorted_txns[2].details == "Latest" # 25 Feb + + def test_yearless_dates_fall_to_epoch_without_year(self): + strategy = ChronologicalSortingStrategy() + transactions = [ + _tx("18 Feb", "No year A"), + _tx("3 Feb", "No year B"), + ] + # Without hint_year both parse to epoch — order undefined but no crash + sorted_txns = strategy.sort(transactions) + assert len(sorted_txns) == 2 + + def test_yearless_and_full_dates_mixed(self): + strategy = ChronologicalSortingStrategy() + transactions = [ + _tx_with_year("18 Feb", "CC yearless", 2026), + _tx("01/01/2026", "Bank full date"), + _tx_with_year("3 Feb", "CC earlier", 2026), + ] + sorted_txns = strategy.sort(transactions) + # 01 Jan 2026, 03 Feb 2026, 18 Feb 2026 + assert sorted_txns[0].details == "Bank full date" + assert sorted_txns[1].details == "CC earlier" + assert sorted_txns[2].details == "CC yearless" + + def test_invalid_statement_year_in_additional_fields_falls_to_epoch(self): + strategy = ChronologicalSortingStrategy() + tx = _tx("3 Feb", "Bad year") + tx.additional_fields["statement_year"] = "not-an-int" + # Should not raise — falls back to epoch + sorted_txns = strategy.sort([tx]) + assert len(sorted_txns) == 1