From 95cb4b8eb86283d3ba599a8ab7eae5c5e506c3e2 Mon Sep 17 00:00:00 2001 From: longieirl Date: Thu, 9 Apr 2026 14:53:39 +0100 Subject: [PATCH 1/4] fix(#129): eliminate phantom empty rows from AIB CC statements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related bugs in RowMergerService caused empty rows in CC output: 1. Ref: continuation lines — AIB CC PDFs emit a 'Ref: ' line after each transaction. Without a specific classifier, TransactionClassifier picked these up as transactions (they have a date) and emitted phantom empty rows. Added RefContinuationClassifier (priority 3) to catch the Ref: pattern before TransactionClassifier runs. 2. Y-split date rows — some transactions have their Transaction Date word at a slightly different Y-coordinate, causing RowBuilder to split the transaction into a date-only row + a dateless row with the actual description/amount. Added date-only split detection in merge_continuation_lines: when a transaction row contains only date-column values and the next row is a transaction with no date, carry the date forward and collapse them. Tests: added TestRefContinuationClassifier unit tests, chain integration test for Ref: classification, and two RowMerger integration tests covering both patterns. 
--- .../extraction/row_classifiers.py | 46 ++++++++-- .../services/row_merger.py | 56 +++++++++++- .../tests/extraction/test_row_classifiers.py | 77 ++++++++++++++++ .../services/test_row_merger_integration.py | 87 +++++++++++++++++++ 4 files changed, 257 insertions(+), 9 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/extraction/row_classifiers.py b/packages/parser-core/src/bankstatements_core/extraction/row_classifiers.py index 2c44df6..d027450 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/row_classifiers.py +++ b/packages/parser-core/src/bankstatements_core/extraction/row_classifiers.py @@ -218,6 +218,34 @@ def _do_classify( return None +class RefContinuationClassifier(RowClassifier): + """Classifies reference-number continuation lines (e.g. AIB CC 'Ref: 123456'). + + AIB Credit Card PDFs split each transaction across two physical lines: + - Line 1: Transaction Date | Posting Date | Details | Amount + - Line 2: Reference number only (e.g. "Ref: 123456") with the same date + repeated in the Transaction Date column + + Without this classifier, Line 2 is misclassified as a 'transaction' by + TransactionClassifier (it has a date) and emitted as an empty phantom row. + This classifier catches it before TransactionClassifier runs and marks it + as 'continuation' so RowMergerService merges it into the parent transaction. 
+ """ + + _REF_PATTERN = re.compile(r"^Ref\s*:\s*\d+", re.IGNORECASE) + + def _do_classify( + self, row: dict, columns: dict[str, tuple[int | float, int | float]] + ) -> str | None: + """Detect reference-number continuation lines.""" + description_text = self._get_description_text(row, columns) + + if self._REF_PATTERN.match(description_text): + return "continuation" + + return None + + class FXContinuationClassifier(RowClassifier): """Classifies foreign exchange continuation lines.""" @@ -401,10 +429,11 @@ def create_row_classifier_chain() -> RowClassifier: 0 HeaderMetadataClassifier — column headers and field labels 1 AdministrativeClassifier — BALANCE FORWARD, Lending @ 2 ReferenceCodeClassifier — IE123456 patterns - 3 FXContinuationClassifier — FX rates, fees, exchange lines - 4 TimestampMetadataClassifier — 01JAN2023 TIME 14:30 - 5 TransactionClassifier — debit/credit/date combinations - 6 DefaultMetadataClassifier — catch-all fallback + 3 RefContinuationClassifier — Ref: 123456 continuation lines (AIB CC) + 4 FXContinuationClassifier — FX rates, fees, exchange lines + 5 TimestampMetadataClassifier — 01JAN2023 TIME 14:30 + 6 TransactionClassifier — debit/credit/date combinations + 7 DefaultMetadataClassifier — catch-all fallback Returns: The head of the classifier chain @@ -414,9 +443,10 @@ def create_row_classifier_chain() -> RowClassifier: (0, HeaderMetadataClassifier), (1, AdministrativeClassifier), (2, ReferenceCodeClassifier), - (3, FXContinuationClassifier), - (4, TimestampMetadataClassifier), - (5, TransactionClassifier), - (6, DefaultMetadataClassifier), + (3, RefContinuationClassifier), + (4, FXContinuationClassifier), + (5, TimestampMetadataClassifier), + (6, TransactionClassifier), + (7, DefaultMetadataClassifier), ] ).build_chain() diff --git a/packages/parser-core/src/bankstatements_core/services/row_merger.py b/packages/parser-core/src/bankstatements_core/services/row_merger.py index 96e48f0..d2984a5 100644 --- 
a/packages/parser-core/src/bankstatements_core/services/row_merger.py +++ b/packages/parser-core/src/bankstatements_core/services/row_merger.py @@ -33,7 +33,7 @@ def __init__(self) -> None: def merge_continuation_lines( # noqa: C901, PLR0912, PLR0915 # pylint: disable=too-many-branches - # Row merger heuristic — 17 branches reflect the full set of continuation + # Row merger heuristic — branches reflect the full set of continuation # line detection rules. Complexity is inherent to the domain logic. self, rows: list[dict], @@ -79,6 +79,31 @@ def merge_continuation_lines( # noqa: C901, PLR0912, PLR0915 row_type = self._classify_row_type(current_row, columns) if row_type == "transaction": + # Detect date-only split: row has a date but no description/amount. + # This happens when the PDF lays out the transaction date at a slightly + # different Y-coordinate than the rest of the row (e.g. AIB CC). + # In that case, carry the date into the next transaction row and skip + # the empty date-only row entirely. + if ( + date_col + and current_row.get(date_col, "").strip() + and not current_row.get(description_col, "").strip() + and self._is_date_only_row(current_row, columns) + and i + 1 < len(rows) + and self._classify_row_type(rows[i + 1], columns) == "transaction" + and not rows[i + 1].get(date_col, "").strip() + ): + # Carry this date into the next row and process that row instead + next_row = rows[i + 1].copy() + next_row[date_col] = current_row[date_col] + rows[i + 1] = next_row + logger.debug( + "Date-only split row: carried date '%s' into next row", + current_row[date_col], + ) + i += 1 + continue + # Look ahead for continuation lines continuation_parts = [] j = i + 1 @@ -151,6 +176,35 @@ def merge_continuation_lines( # noqa: C901, PLR0912, PLR0915 return merged_rows + def _is_date_only_row( + self, + row: dict, + columns: dict[str, tuple[int | float, int | float]], + ) -> bool: + """Return True if the row contains only date-column values and nothing else. 
+ + Used to detect AIB CC Y-split rows where the Transaction Date lands at a + slightly different Y-coordinate than the Posting Date / Details / Amount, + causing RowBuilder to emit a standalone date-only row. + + Args: + row: Row dictionary to inspect + columns: Column definitions + + Returns: + True if all non-empty, non-Filename values belong to date-type columns + """ + from bankstatements_core.domain.column_types import ( # noqa: PLC0415 + get_type_as_string, + ) + + non_empty = { + k: v for k, v in row.items() if v.strip() and k != "Filename" + } + if not non_empty: + return False + return all(get_type_as_string(k) == "date" for k in non_empty) + def _classify_row_type( self, row: dict, columns: dict[str, tuple[int | float, int | float]] ) -> str: diff --git a/packages/parser-core/tests/extraction/test_row_classifiers.py b/packages/parser-core/tests/extraction/test_row_classifiers.py index b842bd2..4cbff07 100644 --- a/packages/parser-core/tests/extraction/test_row_classifiers.py +++ b/packages/parser-core/tests/extraction/test_row_classifiers.py @@ -10,6 +10,7 @@ DefaultMetadataClassifier, FXContinuationClassifier, HeaderMetadataClassifier, + RefContinuationClassifier, ReferenceCodeClassifier, RowClassifier, TimestampMetadataClassifier, @@ -258,6 +259,73 @@ def test_non_transaction(self): assert result is None +class TestRefContinuationClassifier: + """Tests for RefContinuationClassifier (AIB CC reference continuation lines).""" + + @pytest.fixture + def classifier(self): + return RefContinuationClassifier() + + @pytest.fixture + def cc_columns(self): + """CC-style columns (no amount col named 'Debit €').""" + return { + "Transaction Date": (29, 80), + "Posting Date": (80, 118), + "Transaction Details": (118, 370), + "Amount": (370, 430), + } + + def test_ref_colon_digits_classified_as_continuation(self, classifier, cc_columns): + """Ref: with no amount is continuation.""" + row = { + "Transaction Date": "4 Feb", + "Posting Date": "", + "Transaction 
Details": "Ref: 1234567890", + "Amount": "", + } + result = classifier._do_classify(row, cc_columns) + assert result == "continuation" + + def test_ref_no_space_classified_as_continuation(self, classifier, cc_columns): + """Ref: (no space after colon) is also continuation.""" + row = { + "Transaction Date": "4 Feb", + "Transaction Details": "Ref:9876543", + "Amount": "", + } + result = classifier._do_classify(row, cc_columns) + assert result == "continuation" + + def test_ref_with_amount_still_continuation(self, classifier, cc_columns): + """Ref: pattern always classified as continuation regardless of amount. + + The 'Amount' column on CC templates doesn't map to a typed debit/credit column, + so we match purely on the description pattern — Ref: is always a + reference continuation line in practice. + """ + row = { + "Transaction Date": "4 Feb", + "Transaction Details": "Ref: 1234567890", + "Amount": "50.00", + } + result = classifier._do_classify(row, cc_columns) + assert result == "continuation" + + def test_regular_transaction_not_matched(self, classifier): + """Normal transaction description not affected.""" + row = {"Date": "4 Feb", "Details": "PAYPAL *CLEVERBRIDG", "Debit €": "84.54"} + result = classifier._do_classify(row, TEST_COLUMNS) + assert result is None + + def test_reference_word_in_middle_not_matched(self, classifier): + """'Ref:' in the middle of a description is not matched (anchored to start).""" + row = {"Date": "", "Details": "Payment Ref: 123 extra", "Debit €": ""} + # Pattern requires digits immediately after Ref: with no other text + result = classifier._do_classify(row, TEST_COLUMNS) + assert result is None + + class TestDefaultMetadataClassifier: """Tests for DefaultMetadataClassifier.""" @@ -293,6 +361,15 @@ def test_chain_classifies_reference(self): result = chain.classify(row, TEST_COLUMNS) assert result == "reference" + def test_chain_classifies_ref_continuation(self): + """Test chain classifies AIB CC Ref: line as continuation (not 
transaction).""" + chain = create_row_classifier_chain() + # AIB CC: date repeats on the Ref line — without this classifier, + # TransactionClassifier would see the date and emit a phantom empty row. + row = {"Date": "4 Feb", "Details": "Ref: 1234567890", "Debit €": "", "Filename": "test"} + result = chain.classify(row, TEST_COLUMNS) + assert result == "continuation" + def test_chain_classifies_fx_continuation(self): """Test chain correctly classifies FX continuation.""" chain = create_row_classifier_chain() diff --git a/packages/parser-core/tests/services/test_row_merger_integration.py b/packages/parser-core/tests/services/test_row_merger_integration.py index 328256e..80e1d50 100644 --- a/packages/parser-core/tests/services/test_row_merger_integration.py +++ b/packages/parser-core/tests/services/test_row_merger_integration.py @@ -356,3 +356,90 @@ def test_metadata_rows_kept_separate(self, service, columns): assert len(result) == 2 assert result[0]["Balance €"] == "500.00" assert result[1]["Balance €"] == "400.00" + + def test_aib_cc_ref_continuation_line_merged(self, service): + """AIB CC: Ref: continuation line merges into preceding transaction. + + The AIB Credit Card PDF splits each transaction across two physical lines: + - Line 1: Transaction Date | Posting Date | Transaction Details | Amount + - Line 2: same Transaction Date | empty | Ref: | empty + + Without the RefContinuationClassifier, Line 2 is classified as 'transaction' + (it has a date) and emitted as a phantom empty row. With the fix, it is + classified as 'continuation' and merged into Line 1's description. 
+ """ + cc_columns = { + "Transaction Date": (29, 80), + "Posting Date": (80, 118), + "Transaction Details": (118, 370), + "Amount": (370, 430), + } + rows = [ + { + "Transaction Date": "4 Feb", + "Posting Date": "5 Feb", + "Transaction Details": "PAYPAL *CLEVERBRIDG 35314369001 DE", + "Amount": "84.54", + }, + { + # AIB CC repeats the transaction date on the Ref line + "Transaction Date": "4 Feb", + "Posting Date": "", + "Transaction Details": "Ref: 9876543210", + "Amount": "", + }, + ] + + result = service.merge_continuation_lines(rows, cc_columns) + + # Should collapse to 1 row (the Ref line is merged, not emitted separately) + assert len(result) == 1 + assert result[0]["Transaction Date"] == "4 Feb" + assert result[0]["Amount"] == "84.54" + # Ref text is appended to the description + assert "Ref: 9876543210" in result[0]["Transaction Details"] + + def test_aib_cc_date_only_split_row_merged(self, service): + """AIB CC: date-only row caused by Y-split merges into the next transaction. + + Some AIB CC transactions have the Transaction Date at a slightly different + Y-coordinate than the Posting Date / Details / Amount, causing RowBuilder + to emit a standalone date-only row followed by a dateless transaction row. + The merger should carry the date forward and collapse them into one row. 
+ """ + cc_columns = { + "Transaction Date": (29, 80), + "Posting Date": (80, 118), + "Transaction Details": (118, 370), + "Amount": (370, 430), + } + rows = [ + # Date-only split row (Transaction Date word at different Y) + { + "Transaction Date": "4 Feb", + "Posting Date": "", + "Transaction Details": "", + "Amount": "", + }, + # Main transaction row (no Transaction Date, rest of fields present) + { + "Transaction Date": "", + "Posting Date": "5 Feb", + "Transaction Details": "PAYPAL *STRAVA INC 4029357733 US", + "Amount": "59.99", + }, + # Ref continuation + { + "Transaction Date": "", + "Posting Date": "", + "Transaction Details": "Ref: 24036036035604120333083", + "Amount": "", + }, + ] + + result = service.merge_continuation_lines(rows, cc_columns) + + assert len(result) == 1 + assert result[0]["Transaction Date"] == "4 Feb" + assert result[0]["Amount"] == "59.99" + assert "PAYPAL *STRAVA INC" in result[0]["Transaction Details"] From fc1297305256332945ab1827c3f82eaff24e0ec2 Mon Sep 17 00:00:00 2001 From: longieirl Date: Thu, 9 Apr 2026 15:24:20 +0100 Subject: [PATCH 2/4] fix(#132): sort CC transactions by inferring year from Payment Due date Yearless dates (e.g. "3 Feb") from AIB CC statements failed to parse, causing all transactions to fall back to epoch and sort in undefined order. 
- PageHeaderAnalyser.extract_statement_year(): scans full page 1 text for "Payment Due" / "Payment Due Date: DD Mon YYYY" and returns the year - ExtractionResult gains statement_year: int | None field - PDFTableExtractor.extract() restructured: pre-scans page 1 to extract card number and statement year before building the row processor; warns when year cannot be determined - RowPostProcessor stamps statement_year onto each transaction row so it flows into Transaction.additional_fields - DateParserService gains YEARLESS_DATE_FORMATS (%d %b, %d %B) and _parse_yearless_date(date_str, hint_year); parse_transaction_date() accepts optional hint_year parameter - ChronologicalSortingStrategy reads statement_year from additional_fields and passes it as hint_year to the date parser 39 new tests across test_date_parser.py, test_page_header_analyser.py, test_sorting_service.py, and test_row_post_processor.py --- .../domain/models/extraction_result.py | 5 + .../extraction/page_header_analyser.py | 37 +++++ .../extraction/pdf_extractor.py | 71 +++++----- .../extraction/row_post_processor.py | 4 + .../services/date_parser.py | 63 ++++++++- .../services/sorting_service.py | 17 ++- .../extraction/test_page_header_analyser.py | 50 +++++++ .../extraction/test_row_post_processor.py | 53 ++++++++ .../tests/services/test_date_parser.py | 126 ++++++++++++++++++ .../tests/services/test_sorting_service.py | 54 ++++++++ 10 files changed, 442 insertions(+), 38 deletions(-) create mode 100644 packages/parser-core/tests/services/test_date_parser.py diff --git a/packages/parser-core/src/bankstatements_core/domain/models/extraction_result.py b/packages/parser-core/src/bankstatements_core/domain/models/extraction_result.py index 087bbef..f0b22bc 100644 --- a/packages/parser-core/src/bankstatements_core/domain/models/extraction_result.py +++ b/packages/parser-core/src/bankstatements_core/domain/models/extraction_result.py @@ -24,6 +24,10 @@ class ExtractionResult: card_number: Card number 
extracted from credit card statement header, or None for bank statements. Set to "unknown" when CC PDF is detected on paid tier but no card number pattern matches. + statement_year: Year inferred from a document-level date field (e.g. + "Payment Due Date: 3 Mar 2026" → 2026). Used to resolve yearless + transaction dates (e.g. "3 Feb") at sort time. None when the year + could not be determined from the PDF. """ transactions: list[Transaction] @@ -32,3 +36,4 @@ class ExtractionResult: source_file: Path warnings: list[ExtractionWarning] = field(default_factory=list) card_number: str | None = None + statement_year: int | None = None diff --git a/packages/parser-core/src/bankstatements_core/extraction/page_header_analyser.py b/packages/parser-core/src/bankstatements_core/extraction/page_header_analyser.py index 30bbdbe..4ea6713 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/page_header_analyser.py +++ b/packages/parser-core/src/bankstatements_core/extraction/page_header_analyser.py @@ -25,6 +25,15 @@ _IBAN_HEADER_Y = 350 +# Patterns for extracting the statement year from a payment due date field. +# Matches lines like: +# "Payment Due 3 Mar 2026" +# "Payment Due Date: 20 Feb 2026" +_PAYMENT_DUE_PATTERNS = [ + r"Payment\s+Due\s+Date\s*[:\s]\s*\d{1,2}\s+\w+\s+(\d{4})", + r"Payment\s+Due\s+\d{1,2}\s+\w+\s+(\d{4})", +] + class PageHeaderAnalyser: """Inspects the page header area for credit card indicators and IBAN.""" @@ -57,6 +66,34 @@ def is_credit_card_statement(self, page: Any, table_top_y: int) -> bool: logger.warning("Error checking for credit card statement: %s", e) return False + def extract_statement_year(self, page: Any) -> int | None: + """Extract the statement year from a 'Payment Due' or 'Payment Due Date' field. 
+ + Scans the full page 1 text for patterns like: + - "Payment Due 3 Mar 2026" + - "Payment Due Date: 20 Feb 2026" + + Args: + page: pdfplumber page object (page 1 only) + + Returns: + Four-digit year as int if found, None otherwise + """ + try: + page_text = page.extract_text() + if page_text: + for pattern in _PAYMENT_DUE_PATTERNS: + match = re.search(pattern, page_text, re.IGNORECASE) + if match: + year = int(match.group(1)) + logger.debug( + "Statement year %d extracted from 'Payment Due' field", year + ) + return year + except (AttributeError, ValueError, TypeError) as e: + logger.warning("Error extracting statement year from page: %s", e) + return None + def extract_iban(self, page: Any) -> str | None: """Extract account IBAN from the page header area (y < 350). diff --git a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py index 596a5b7..084d0fa 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py @@ -94,22 +94,15 @@ def extract(self, pdf_path: Path) -> ExtractionResult: rows: list[dict] = [] iban = None card_number: str | None = None - - filename_date = extract_filename_date(pdf_path.name) - page_processor = StatefulPageRowProcessor( - RowPostProcessor( - columns=self.columns, - row_classifier=self._row_classifier, - template=self.template, - filename_date=filename_date, - filename=pdf_path.name, - ) - ) + statement_year: int | None = None with self._pdf_reader.open(pdf_path) as pdf: - for page_num, page in enumerate(pdf.pages, 1): - if page_num == 1 and self._header_analyser.is_credit_card_statement( - page, self.table_top_y + # --- Page 1 pre-scan: gather document-level metadata before processing rows --- + if pdf.pages: + page1 = pdf.pages[0] + + if self._header_analyser.is_credit_card_statement( + page1, self.table_top_y ): if self._entitlements 
is None or self._entitlements.require_iban: logger.warning( @@ -129,27 +122,40 @@ def extract(self, pdf_path: Path) -> ExtractionResult: ], ) - if iban is None and page_num == 1: - iban = self._header_analyser.extract_iban(page) - if iban: - logger.info( - "IBAN found on page %s: %s****%s", - page_num, - iban[:4], - iban[-4:], + # Paid tier CC: extract card number and statement year up front + extracted = self._extract_card_number(page1) + card_number = extracted if extracted is not None else "unknown" + + statement_year = self._header_analyser.extract_statement_year(page1) + if statement_year is None: + logger.warning( + "Could not determine statement year from '%s'. " + "Yearless dates will not sort correctly.", + pdf_path.name, ) - # Extract card number on page 1 for CC statements (paid tier only) - if page_num == 1 and card_number is None: - extracted = self._extract_card_number(page) - if extracted is not None: - card_number = extracted - elif self._header_analyser.is_credit_card_statement( - page, self.table_top_y - ): - # CC PDF detected on paid tier but no card pattern matched - card_number = "unknown" + iban = self._header_analyser.extract_iban(page1) + if iban: + logger.info( + "IBAN found on page 1: %s****%s", + iban[:4], + iban[-4:], + ) + # Build page processor now that document-level metadata is known + filename_date = extract_filename_date(pdf_path.name) + page_processor = StatefulPageRowProcessor( + RowPostProcessor( + columns=self.columns, + row_classifier=self._row_classifier, + template=self.template, + filename_date=filename_date, + filename=pdf_path.name, + statement_year=statement_year, + ) + ) + + for page_num, page in enumerate(pdf.pages, 1): page_rows = self._extract_page(page, page_num) if page_rows is None: continue @@ -168,6 +174,7 @@ def extract(self, pdf_path: Path) -> ExtractionResult: iban=iban, source_file=pdf_path, card_number=card_number, + statement_year=statement_year, ) def _extract_card_number(self, page: Any) -> str | None: 
diff --git a/packages/parser-core/src/bankstatements_core/extraction/row_post_processor.py b/packages/parser-core/src/bankstatements_core/extraction/row_post_processor.py index ebb18b1..6a5c6a2 100644 --- a/packages/parser-core/src/bankstatements_core/extraction/row_post_processor.py +++ b/packages/parser-core/src/bankstatements_core/extraction/row_post_processor.py @@ -58,6 +58,7 @@ def __init__( # noqa: PLR0913 filename_date: str, filename: str, scoring_config: ExtractionScoringConfig | None = None, + statement_year: int | None = None, ) -> None: self._columns = columns self._row_classifier = row_classifier @@ -69,6 +70,7 @@ def __init__( # noqa: PLR0913 if scoring_config is not None else ExtractionScoringConfig.default() ) + self._statement_year = statement_year self._date_col = ColumnTypeIdentifier.find_first_column_of_type(columns, "date") self._balance_col = ColumnTypeIdentifier.find_first_column_of_type( columns, "balance" @@ -147,6 +149,8 @@ def process(self, row: dict, current_date: str) -> str: # Metadata tagging row["Filename"] = self._filename + if self._statement_year is not None: + row["statement_year"] = str(self._statement_year) if self._template: row["document_type"] = self._template.document_type row["template_id"] = self._template.id diff --git a/packages/parser-core/src/bankstatements_core/services/date_parser.py b/packages/parser-core/src/bankstatements_core/services/date_parser.py index cee53a5..33ae383 100644 --- a/packages/parser-core/src/bankstatements_core/services/date_parser.py +++ b/packages/parser-core/src/bankstatements_core/services/date_parser.py @@ -43,7 +43,15 @@ class DateParserService: "%d%B%Y", # 01December2023 ] - def parse_transaction_date(self, date_str: str) -> datetime: + # Yearless date formats used by CC statements (e.g. 
AIB CC: "3 Feb") + YEARLESS_DATE_FORMATS = [ # noqa: RUF012 + "%d %b", # 3 Feb + "%d %B", # 3 February + ] + + def parse_transaction_date( + self, date_str: str, hint_year: int | None = None + ) -> datetime: """ Parse bank statement date string into datetime object. @@ -59,9 +67,13 @@ def parse_transaction_date(self, date_str: str) -> datetime: - DDMMMYY (e.g., "01DEC23") - DDMMMYYYY (e.g., "01DEC2023") - Partial dates: "DD/MM" (missing year) + - Yearless word dates: "3 Feb", "3 February" (requires hint_year) Args: date_str: Date string from bank statement + hint_year: Optional year inferred from a document-level field (e.g. + "Payment Due Date"). Used to resolve yearless dates like "3 Feb". + When None and the date is yearless, epoch is returned. Returns: datetime object, or epoch (1970-01-01) if unparseable @@ -78,6 +90,8 @@ def parse_transaction_date(self, date_str: str) -> datetime: datetime.datetime(2025, 4, 25, 0, 0) >>> service.parse_transaction_date("") datetime.datetime(1970, 1, 1, 0, 0) + >>> service.parse_transaction_date("3 Feb", hint_year=2026) + datetime.datetime(2026, 2, 3, 0, 0) """ # Handle empty or whitespace-only strings if not date_str or not date_str.strip(): @@ -89,11 +103,16 @@ def parse_transaction_date(self, date_str: str) -> datetime: # "Sept" -> "Sep" (Python's datetime uses 3-letter abbreviations) date_str = date_str.replace("Sept", "Sep") - # Try common date formats + # Try common date formats (all include a year component) parsed_date = self._parse_common_date_formats(date_str) if parsed_date is not None: return parsed_date + # Try yearless formats (e.g. 
"3 Feb" from CC statements) + parsed_date = self._parse_yearless_date(date_str, hint_year) + if parsed_date is not None: + return parsed_date + # Try partial date parsing (DD/MM without year) parsed_date = self._parse_partial_date(date_str) if parsed_date is not None: @@ -200,6 +219,46 @@ def _parse_common_date_formats(self, date_str: str) -> datetime | None: return None + def _parse_yearless_date( + self, date_str: str, hint_year: int | None + ) -> datetime | None: + """ + Parse yearless date strings like "3 Feb" or "3 February". + + Uses hint_year (from a document-level field such as "Payment Due Date") + to supply the missing year. Returns None when no hint_year is available + so the caller can fall through to epoch and log a warning. + + Args: + date_str: Date string without a year component + hint_year: Year to substitute, or None if unknown + + Returns: + Parsed datetime if format matches and hint_year is provided, None otherwise + + Examples: + >>> service = DateParserService() + >>> service._parse_yearless_date("3 Feb", 2026) + datetime.datetime(2026, 2, 3, 0, 0) + >>> service._parse_yearless_date("3 February", 2026) + datetime.datetime(2026, 2, 3, 0, 0) + >>> service._parse_yearless_date("3 Feb", None) + None + """ + if hint_year is None: + return None + + for fmt in self.YEARLESS_DATE_FORMATS: + # Append a fixed year so strptime never parses a year-free date + # (avoids Python 3.15 deprecation of yearless strptime). + augmented = f"{date_str} 1900" + augmented_fmt = f"{fmt} %Y" + parsed = self._try_parse_date_format(augmented, augmented_fmt) + if parsed is not None: + return parsed.replace(year=hint_year) + + return None + def _parse_partial_date(self, date_str: str) -> datetime | None: """ Parse partial date strings like "DD/MM" or "DD-MM" (missing year). 
diff --git a/packages/parser-core/src/bankstatements_core/services/sorting_service.py b/packages/parser-core/src/bankstatements_core/services/sorting_service.py index 9e5d0aa..f0cf530 100644 --- a/packages/parser-core/src/bankstatements_core/services/sorting_service.py +++ b/packages/parser-core/src/bankstatements_core/services/sorting_service.py @@ -6,8 +6,10 @@ from __future__ import annotations +import contextlib import logging from abc import ABC, abstractmethod +from datetime import datetime from typing import TYPE_CHECKING from bankstatements_core.services.date_parser import DateParserService @@ -55,10 +57,17 @@ def sort(self, transactions: list[Transaction]) -> list[Transaction]: logger.debug("Sorting %d transactions chronologically", len(transactions)) - return sorted( - transactions, - key=lambda tx: _date_parser_service.parse_transaction_date(tx.date), - ) + def _sort_key(tx: Transaction) -> datetime: + hint_year: int | None = None + raw = tx.additional_fields.get("statement_year") + if raw is not None: + with contextlib.suppress(ValueError): + hint_year = int(raw) + return _date_parser_service.parse_transaction_date( + tx.date, hint_year=hint_year + ) + + return sorted(transactions, key=_sort_key) class NoSortingStrategy(SortingStrategy): diff --git a/packages/parser-core/tests/extraction/test_page_header_analyser.py b/packages/parser-core/tests/extraction/test_page_header_analyser.py index c7a1f12..c9f2a15 100644 --- a/packages/parser-core/tests/extraction/test_page_header_analyser.py +++ b/packages/parser-core/tests/extraction/test_page_header_analyser.py @@ -114,3 +114,53 @@ def test_crops_at_fixed_350(self): analyser = PageHeaderAnalyser(mock_extractor) analyser.extract_iban(page) page.crop.assert_called_once_with((0, 0, page.width, 350)) + + +class TestExtractStatementYear: + """Tests for PageHeaderAnalyser.extract_statement_year.""" + + def _analyser(self) -> PageHeaderAnalyser: + return PageHeaderAnalyser(Mock()) + + def _make_full_page(self, 
full_text: str) -> Mock: + """Build a mock page where extract_text() returns full_text (no crop needed).""" + page = Mock() + page.extract_text.return_value = full_text + return page + + def test_payment_due_date_colon(self): + page = self._make_full_page("Payment Due Date: 20 Feb 2026\nSome other text") + assert self._analyser().extract_statement_year(page) == 2026 + + def test_payment_due_no_colon(self): + page = self._make_full_page("Payment Due 3 Mar 2026\nBalance: €0.00") + assert self._analyser().extract_statement_year(page) == 2026 + + def test_payment_due_date_different_year(self): + page = self._make_full_page("Payment Due Date: 1 Jan 2025") + assert self._analyser().extract_statement_year(page) == 2025 + + def test_case_insensitive(self): + page = self._make_full_page("PAYMENT DUE DATE: 15 Apr 2026") + assert self._analyser().extract_statement_year(page) == 2026 + + def test_returns_none_when_no_payment_due(self): + page = self._make_full_page("Statement Date: 01 Feb 2026\nBalance: €100.00") + assert self._analyser().extract_statement_year(page) is None + + def test_returns_none_for_empty_text(self): + page = self._make_full_page("") + assert self._analyser().extract_statement_year(page) is None + + def test_returns_none_for_none_text(self): + page = self._make_full_page(None) + assert self._analyser().extract_statement_year(page) is None + + def test_returns_none_on_page_exception(self): + page = Mock() + page.extract_text.side_effect = AttributeError("no text") + assert self._analyser().extract_statement_year(page) is None + + def test_payment_due_date_with_extra_whitespace(self): + page = self._make_full_page("Payment Due Date: 18 Feb 2026") + assert self._analyser().extract_statement_year(page) == 2026 diff --git a/packages/parser-core/tests/extraction/test_row_post_processor.py b/packages/parser-core/tests/extraction/test_row_post_processor.py index 5544d04..75a89a0 100644 --- a/packages/parser-core/tests/extraction/test_row_post_processor.py +++ 
b/packages/parser-core/tests/extraction/test_row_post_processor.py @@ -377,3 +377,56 @@ def test_bank_statement_document_type_unchanged(self): } proc.process(row, "") assert row["document_type"] == "bank_statement" + + def test_statement_year_stamped_on_transaction_row(self): + """statement_year is stamped as a string on each transaction row when provided.""" + proc = RowPostProcessor( + columns=TEST_COLUMNS, + row_classifier=_make_classifier("transaction"), + template=None, + filename_date="", + filename="statement.pdf", + statement_year=2026, + ) + row = { + "Date": "3 Feb", + "Details": "Purchase", + "Debit €": "", + "Credit €": "", + "Balance €": "", + } + proc.process(row, "") + assert row["statement_year"] == "2026" + + def test_statement_year_not_stamped_when_none(self): + """statement_year key absent from row when not provided.""" + proc = _make_processor() + row = { + "Date": "3 Feb", + "Details": "Purchase", + "Debit €": "", + "Credit €": "", + "Balance €": "", + } + proc.process(row, "") + assert "statement_year" not in row + + def test_statement_year_only_on_transaction_rows(self): + """statement_year is not stamped on non-transaction rows.""" + proc = RowPostProcessor( + columns=TEST_COLUMNS, + row_classifier=_make_classifier("header"), + template=None, + filename_date="", + filename="statement.pdf", + statement_year=2026, + ) + row = { + "Date": "", + "Details": "Date Details Debit Credit Balance", + "Debit €": "", + "Credit €": "", + "Balance €": "", + } + proc.process(row, "") + assert "statement_year" not in row diff --git a/packages/parser-core/tests/services/test_date_parser.py b/packages/parser-core/tests/services/test_date_parser.py new file mode 100644 index 0000000..8792a68 --- /dev/null +++ b/packages/parser-core/tests/services/test_date_parser.py @@ -0,0 +1,126 @@ +"""Tests for DateParserService — yearless date parsing and hint_year support.""" + +from __future__ import annotations + +from datetime import datetime + +import pytest + +from 
bankstatements_core.services.date_parser import DateParserService + + +@pytest.fixture() +def service() -> DateParserService: + return DateParserService() + + +class TestParseYearlessDate: + """Tests for _parse_yearless_date with hint_year.""" + + def test_abbreviated_month_with_hint_year(self, service): + result = service._parse_yearless_date("3 Feb", 2026) + assert result == datetime(2026, 2, 3) + + def test_full_month_name_with_hint_year(self, service): + result = service._parse_yearless_date("3 February", 2026) + assert result == datetime(2026, 2, 3) + + def test_single_digit_day(self, service): + result = service._parse_yearless_date("5 Jan", 2025) + assert result == datetime(2025, 1, 5) + + def test_two_digit_day(self, service): + result = service._parse_yearless_date("18 Mar", 2026) + assert result == datetime(2026, 3, 18) + + def test_returns_none_when_no_hint_year(self, service): + assert service._parse_yearless_date("3 Feb", None) is None + + def test_returns_none_for_non_yearless_format(self, service): + assert service._parse_yearless_date("01/02/2026", 2026) is None + + def test_hint_year_overrides_default(self, service): + result = service._parse_yearless_date("1 Dec", 2024) + assert result is not None + assert result.year == 2024 + + def test_all_months_abbreviated(self, service): + months = [ + ("Jan", 1), + ("Feb", 2), + ("Mar", 3), + ("Apr", 4), + ("May", 5), + ("Jun", 6), + ("Jul", 7), + ("Aug", 8), + ("Sep", 9), + ("Oct", 10), + ("Nov", 11), + ("Dec", 12), + ] + for abbr, month_num in months: + result = service._parse_yearless_date(f"1 {abbr}", 2026) + assert result is not None, f"Failed to parse '1 {abbr}'" + assert result.month == month_num + + +class TestParseTransactionDateWithHintYear: + """Tests for parse_transaction_date with hint_year for yearless dates.""" + + def test_yearless_date_resolved_with_hint(self, service): + result = service.parse_transaction_date("3 Feb", hint_year=2026) + assert result == datetime(2026, 2, 3) + + def 
test_yearless_date_returns_epoch_without_hint(self, service): + result = service.parse_transaction_date("3 Feb") + assert result == service.EPOCH_DATE + + def test_full_month_name_yearless_with_hint(self, service): + result = service.parse_transaction_date("18 February", hint_year=2026) + assert result == datetime(2026, 2, 18) + + def test_dated_format_ignores_hint_year(self, service): + # Dates with year component should not be affected by hint_year + result = service.parse_transaction_date("01/02/2023", hint_year=2026) + assert result == datetime(2023, 2, 1) + + def test_dd_mmm_yyyy_ignores_hint_year(self, service): + result = service.parse_transaction_date("25 Apr 2025", hint_year=2026) + assert result == datetime(2025, 4, 25) + + def test_empty_string_returns_epoch(self, service): + assert service.parse_transaction_date("", hint_year=2026) == service.EPOCH_DATE + + def test_unparseable_string_returns_epoch(self, service): + assert ( + service.parse_transaction_date("not-a-date", hint_year=2026) + == service.EPOCH_DATE + ) + + def test_yearless_date_logs_no_warning_when_hint_provided(self, service, caplog): + import logging + + with caplog.at_level(logging.WARNING): + service.parse_transaction_date("3 Feb", hint_year=2026) + assert "Unable to parse date" not in caplog.text + + def test_yearless_date_logs_warning_without_hint(self, service, caplog): + import logging + + with caplog.at_level(logging.WARNING): + service.parse_transaction_date("3 Feb") + assert "Unable to parse date '3 Feb'" in caplog.text + + +class TestYearlessDateFormats: + """Verify YEARLESS_DATE_FORMATS constant is correctly defined.""" + + def test_yearless_formats_defined(self, service): + assert "%d %b" in service.YEARLESS_DATE_FORMATS + assert "%d %B" in service.YEARLESS_DATE_FORMATS + + def test_yearless_formats_not_in_main_formats(self, service): + # Yearless formats must NOT appear in DATE_FORMATS to avoid ambiguity + assert "%d %b" not in service.DATE_FORMATS + assert "%d %B" not in 
service.DATE_FORMATS diff --git a/packages/parser-core/tests/services/test_sorting_service.py b/packages/parser-core/tests/services/test_sorting_service.py index e312fee..13d49b1 100644 --- a/packages/parser-core/tests/services/test_sorting_service.py +++ b/packages/parser-core/tests/services/test_sorting_service.py @@ -14,6 +14,13 @@ def _tx(date: str, details: str) -> Transaction: return Transaction.from_dict({"Date": date, "Details": details}) +def _tx_with_year(date: str, details: str, statement_year: int) -> Transaction: + """Create a transaction with a statement_year in additional_fields (as stamped by RowPostProcessor).""" + tx = Transaction.from_dict({"Date": date, "Details": details}) + tx.additional_fields["statement_year"] = str(statement_year) + return tx + + class TestChronologicalSortingStrategy: """Tests for ChronologicalSortingStrategy.""" @@ -207,3 +214,50 @@ def test_sort_large_dataset(self): # Verify first few are early dates assert "01 Jan" in sorted_txns[0].date or "02 Jan" in sorted_txns[0].date + + +class TestChronologicalSortingWithYearlessDates: + """Tests for yearless date sorting using statement_year from additional_fields.""" + + def test_yearless_dates_sorted_when_year_present(self): + strategy = ChronologicalSortingStrategy() + transactions = [ + _tx_with_year("18 Feb", "Later", 2026), + _tx_with_year("3 Feb", "Earlier", 2026), + _tx_with_year("25 Feb", "Latest", 2026), + ] + sorted_txns = strategy.sort(transactions) + assert sorted_txns[0].details == "Earlier" # 3 Feb + assert sorted_txns[1].details == "Later" # 18 Feb + assert sorted_txns[2].details == "Latest" # 25 Feb + + def test_yearless_dates_fall_to_epoch_without_year(self): + strategy = ChronologicalSortingStrategy() + transactions = [ + _tx("18 Feb", "No year A"), + _tx("3 Feb", "No year B"), + ] + # Without hint_year both parse to epoch — order undefined but no crash + sorted_txns = strategy.sort(transactions) + assert len(sorted_txns) == 2 + + def 
test_yearless_and_full_dates_mixed(self): + strategy = ChronologicalSortingStrategy() + transactions = [ + _tx_with_year("18 Feb", "CC yearless", 2026), + _tx("01/01/2026", "Bank full date"), + _tx_with_year("3 Feb", "CC earlier", 2026), + ] + sorted_txns = strategy.sort(transactions) + # 01 Jan 2026, 03 Feb 2026, 18 Feb 2026 + assert sorted_txns[0].details == "Bank full date" + assert sorted_txns[1].details == "CC earlier" + assert sorted_txns[2].details == "CC yearless" + + def test_invalid_statement_year_in_additional_fields_falls_to_epoch(self): + strategy = ChronologicalSortingStrategy() + tx = _tx("3 Feb", "Bad year") + tx.additional_fields["statement_year"] = "not-an-int" + # Should not raise — falls back to epoch + sorted_txns = strategy.sort([tx]) + assert len(sorted_txns) == 1 From 1f45f3f0c5abad2191e09b720bc3c75f393481e Mon Sep 17 00:00:00 2001 From: longieirl Date: Thu, 9 Apr 2026 15:35:02 +0100 Subject: [PATCH 3/4] refactor: decompose merge_continuation_lines to pass xenon C gate Extract _is_date_only_split, _collect_continuations, and _handle_orphaned_continuation helpers to reduce cyclomatic complexity of merge_continuation_lines from D (23) to B (8). Also fix pre-existing isort ordering in parser-core tests.
--- .../services/row_merger.py | 181 ++++++++++-------- .../tests/extraction/test_row_classifiers.py | 7 +- 2 files changed, 102 insertions(+), 86 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/services/row_merger.py b/packages/parser-core/src/bankstatements_core/services/row_merger.py index d2984a5..690579a 100644 --- a/packages/parser-core/src/bankstatements_core/services/row_merger.py +++ b/packages/parser-core/src/bankstatements_core/services/row_merger.py @@ -31,10 +31,7 @@ def __init__(self) -> None: """Initialize the row merger service.""" self._last_transaction_row: dict | None = None - def merge_continuation_lines( # noqa: C901, PLR0912, PLR0915 - # pylint: disable=too-many-branches - # Row merger heuristic — branches reflect the full set of continuation - # line detection rules. Complexity is inherent to the domain logic. + def merge_continuation_lines( self, rows: list[dict], columns: dict[str, tuple[int | float, int | float]], @@ -61,15 +58,13 @@ def merge_continuation_lines( # noqa: C901, PLR0912, PLR0915 if not rows: return rows - # Reset state for this batch self._last_transaction_row = None - # Find description and date columns description_col = find_first_column_of_type(columns, "description") date_col = find_first_column_of_type(columns, "date") if not description_col: - return rows # Can't merge without description column + return rows merged_rows = [] i = 0 @@ -79,103 +74,121 @@ def merge_continuation_lines( # noqa: C901, PLR0912, PLR0915 row_type = self._classify_row_type(current_row, columns) if row_type == "transaction": - # Detect date-only split: row has a date but no description/amount. - # This happens when the PDF lays out the transaction date at a slightly - # different Y-coordinate than the rest of the row (e.g. AIB CC). - # In that case, carry the date into the next transaction row and skip - # the empty date-only row entirely. 
- if ( - date_col - and current_row.get(date_col, "").strip() - and not current_row.get(description_col, "").strip() - and self._is_date_only_row(current_row, columns) - and i + 1 < len(rows) - and self._classify_row_type(rows[i + 1], columns) == "transaction" - and not rows[i + 1].get(date_col, "").strip() - ): - # Carry this date into the next row and process that row instead + if self._is_date_only_split(current_row, rows, i, date_col, columns): next_row = rows[i + 1].copy() - next_row[date_col] = current_row[date_col] + next_row[date_col] = current_row[date_col] # type: ignore[index] rows[i + 1] = next_row - logger.debug( - "Date-only split row: carried date '%s' into next row", - current_row[date_col], - ) + logger.debug("Date-only split row: carried date '%s' into next row", current_row[date_col]) # type: ignore[index] i += 1 continue - # Look ahead for continuation lines - continuation_parts = [] - j = i + 1 - - while j < len(rows): - next_row = rows[j] - next_type = self._classify_row_type(next_row, columns) - - if next_type == "continuation": - # Extract the continuation text - continuation_text = next_row.get(description_col, "").strip() - if continuation_text: - continuation_parts.append(continuation_text) - - # If this continuation line has a balance, preserve it - current_row = self._preserve_balance_from_continuation( - current_row, next_row, columns - ) - - j += 1 - elif next_type == "transaction": - # Found next transaction, stop looking for continuations - break - else: - # Other row types (administrative, etc.) 
- stop looking - break - - # Merge continuation parts into the main transaction description - if continuation_parts: - original_desc = current_row.get(description_col, "").strip() - merged_desc = original_desc + " " + " ".join(continuation_parts) - current_row[description_col] = merged_desc.strip() - - # Store current row as last transaction for date carry-forward + current_row, j = self._collect_continuations( + current_row, rows, i, description_col, columns + ) self._last_transaction_row = current_row.copy() - merged_rows.append(current_row) - i = j # Skip to after the last continuation line + i = j elif row_type == "continuation": - # Continuation line without preceding transaction - # Check if it's missing a date (date grouping pattern) - if date_col and self._last_transaction_row: - current_date = current_row.get(date_col, "").strip() - if not current_date: - # Carry forward date from last transaction - last_date = self._last_transaction_row.get(date_col, "").strip() - if last_date: - current_row[date_col] = last_date - logger.debug( - "Carried forward date '%s' to continuation row", - last_date, - ) - # Reclassify - it might be a transaction now - row_type = self._classify_row_type(current_row, columns) - + current_row, row_type = self._handle_orphaned_continuation( + current_row, row_type, date_col, columns + ) if row_type == "transaction": - # After date carry-forward, it's now a transaction self._last_transaction_row = current_row.copy() merged_rows.append(current_row) else: - # Still a continuation - skip orphaned line logger.warning("Orphaned continuation line: %s", current_row) i += 1 else: - # Non-transaction, non-continuation row - keep as is merged_rows.append(current_row) i += 1 return merged_rows + def _is_date_only_split( + self, + current_row: dict, + rows: list[dict], + i: int, + date_col: str | None, + columns: dict[str, tuple[int | float, int | float]], + ) -> bool: + """Return True when this row is a date-only PDF split that should be carried 
forward. + + Detects AIB CC Y-split rows where the transaction date lands at a slightly + different Y-coordinate, causing RowBuilder to emit a standalone date-only row. + """ + desc_col = find_first_column_of_type(columns, "description") + return bool( + date_col + and desc_col + and current_row.get(date_col, "").strip() + and not current_row.get(desc_col, "").strip() + and self._is_date_only_row(current_row, columns) + and i + 1 < len(rows) + and self._classify_row_type(rows[i + 1], columns) == "transaction" + and not rows[i + 1].get(date_col, "").strip() + ) + + def _collect_continuations( + self, + current_row: dict, + rows: list[dict], + i: int, + description_col: str, + columns: dict[str, tuple[int | float, int | float]], + ) -> tuple[dict, int]: + """Scan ahead and merge any continuation lines into current_row. + + Returns the updated row and the index of the next unprocessed row. + """ + continuation_parts: list[str] = [] + j = i + 1 + + while j < len(rows): + next_row = rows[j] + next_type = self._classify_row_type(next_row, columns) + + if next_type == "continuation": + text = next_row.get(description_col, "").strip() + if text: + continuation_parts.append(text) + current_row = self._preserve_balance_from_continuation( + current_row, next_row, columns + ) + j += 1 + else: + break + + if continuation_parts: + original_desc = current_row.get(description_col, "").strip() + current_row[description_col] = ( + original_desc + " " + " ".join(continuation_parts) + ).strip() + + return current_row, j + + def _handle_orphaned_continuation( + self, + current_row: dict, + row_type: str, + date_col: str | None, + columns: dict[str, tuple[int | float, int | float]], + ) -> tuple[dict, str]: + """Attempt to promote an orphaned continuation by carrying forward the last date.""" + if ( + date_col + and self._last_transaction_row + and not current_row.get(date_col, "").strip() + ): + last_date = self._last_transaction_row.get(date_col, "").strip() + if last_date: + 
current_row[date_col] = last_date + logger.debug("Carried forward date '%s' to continuation row", last_date) + row_type = self._classify_row_type(current_row, columns) + return current_row, row_type + def _is_date_only_row( self, row: dict, @@ -198,9 +211,7 @@ def _is_date_only_row( get_type_as_string, ) - non_empty = { - k: v for k, v in row.items() if v.strip() and k != "Filename" - } + non_empty = {k: v for k, v in row.items() if v.strip() and k != "Filename"} if not non_empty: return False return all(get_type_as_string(k) == "date" for k in non_empty) diff --git a/packages/parser-core/tests/extraction/test_row_classifiers.py b/packages/parser-core/tests/extraction/test_row_classifiers.py index 4cbff07..c42ea7d 100644 --- a/packages/parser-core/tests/extraction/test_row_classifiers.py +++ b/packages/parser-core/tests/extraction/test_row_classifiers.py @@ -366,7 +366,12 @@ def test_chain_classifies_ref_continuation(self): chain = create_row_classifier_chain() # AIB CC: date repeats on the Ref line — without this classifier, # TransactionClassifier would see the date and emit a phantom empty row. 
- row = {"Date": "4 Feb", "Details": "Ref: 1234567890", "Debit €": "", "Filename": "test"} + row = { + "Date": "4 Feb", + "Details": "Ref: 1234567890", + "Debit €": "", + "Filename": "test", + } result = chain.classify(row, TEST_COLUMNS) assert result == "continuation" From 92bcb5e30e067ade2c1da1744d232681686b925c Mon Sep 17 00:00:00 2001 From: longieirl Date: Thu, 9 Apr 2026 15:37:29 +0100 Subject: [PATCH 4/4] fix: remove unused type: ignore comments flagged by mypy --- .../src/bankstatements_core/services/row_merger.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/services/row_merger.py b/packages/parser-core/src/bankstatements_core/services/row_merger.py index 690579a..049e8c1 100644 --- a/packages/parser-core/src/bankstatements_core/services/row_merger.py +++ b/packages/parser-core/src/bankstatements_core/services/row_merger.py @@ -76,9 +76,12 @@ def merge_continuation_lines( if row_type == "transaction": if self._is_date_only_split(current_row, rows, i, date_col, columns): next_row = rows[i + 1].copy() - next_row[date_col] = current_row[date_col] # type: ignore[index] + next_row[date_col] = current_row[date_col] rows[i + 1] = next_row - logger.debug("Date-only split row: carried date '%s' into next row", current_row[date_col]) # type: ignore[index] + logger.debug( + "Date-only split row: carried date '%s' into next row", + current_row[date_col], + ) i += 1 continue