From 98db280edee782b3cba9fc549d034a243116e41f Mon Sep 17 00:00:00 2001
From: longieirl
Date: Thu, 9 Apr 2026 21:06:20 +0100
Subject: [PATCH] fix(#129): add aib_credit_card.json template with correct CC column boundaries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without a credit_card_statement template, get_default_for_type() fell back
to the global default (bank statement columns). RowBuilder then mapped CC
PDF words to Date/Details/Debit/Credit/Balance — wrong x-boundaries — so
RefContinuationClassifier saw empty description text and classified Ref
lines as transactions, emitting phantom empty rows in CC output.

Adds aib_credit_card.json with:
- document_type: credit_card_statement (selected by TemplateDetector)
- Correct column layout: Transaction Date, Posting Date, Transaction Details, Amount
- Detection via header_keywords and column_headers

Adds 3 regression tests in TestAIBCCTemplateColumnsFix covering:
- get_default_for_type('credit_card_statement') returns aib_credit_card
- Column names include 'Transaction Details' (required by RefContinuationClassifier)
- Column x-boundaries match known AIB CC PDF layout
---
 .../templates/aib_credit_card.json            | 37 ++++++++
 .../templates/test_aib_template_detection.py  | 85 +++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100644 packages/parser-core/src/bankstatements_core/templates/aib_credit_card.json

diff --git a/packages/parser-core/src/bankstatements_core/templates/aib_credit_card.json b/packages/parser-core/src/bankstatements_core/templates/aib_credit_card.json
new file mode 100644
index 0000000..6e59223
--- /dev/null
+++ b/packages/parser-core/src/bankstatements_core/templates/aib_credit_card.json
@@ -0,0 +1,37 @@
+{
+  "id": "aib_credit_card",
+  "name": "AIB Ireland Credit Card Statement",
+  "document_type": "credit_card_statement",
+  "enabled": true,
+  "detection": {
+    "header_keywords": [
+      "Allied Irish Banks",
+      "Credit Card Statement",
+      "Card Statement"
+    ],
+    "column_headers": [
+      "Transaction Date",
+      "Posting Date",
+      "Transaction Details",
+      "Amount"
+    ]
+  },
+  "extraction": {
+    "table_top_y": 300,
+    "table_bottom_y": 720,
+    "enable_header_check": true,
+    "header_check_top_y": 250,
+    "columns": {
+      "Transaction Date": [29, 80],
+      "Posting Date": [80, 118],
+      "Transaction Details": [118, 370],
+      "Amount": [370, 430]
+    }
+  },
+  "processing": {
+    "supports_multiline": true,
+    "date_format": "%d %b",
+    "currency_symbol": "\u20ac",
+    "decimal_separator": "."
+  }
+}
diff --git a/packages/parser-core/tests/templates/test_aib_template_detection.py b/packages/parser-core/tests/templates/test_aib_template_detection.py
index 00ea5d2..fc26e8c 100644
--- a/packages/parser-core/tests/templates/test_aib_template_detection.py
+++ b/packages/parser-core/tests/templates/test_aib_template_detection.py
@@ -298,5 +298,90 @@ def test_aib_credit_card_is_credit_card_statement(self):
         assert aib_cc.document_type == "credit_card_statement"
 
 
+class TestAIBCCTemplateColumnsFix:
+    """Regression tests for issue #129 — CC PDF using wrong (bank) column layout.
+
+    Without aib_credit_card.json, get_default_for_type("credit_card_statement")
+    falls back to the global default (bank statement columns). RowBuilder then
+    maps CC PDF words to Date/Details/Debit/Credit/Balance — none of which match
+    the CC column positions — so RefContinuationClassifier sees an empty description
+    and classifies Ref lines as transactions, emitting phantom rows.
+    """
+
+    def test_cc_default_template_uses_cc_columns(self):
+        """get_default_for_type('credit_card_statement') returns aib_credit_card.
+
+        Before the fix, no credit_card_statement template existed and the call
+        fell back to the global default (bank statement columns), causing phantom
+        empty rows in CC output.
+        """
+        registry = TemplateRegistry.from_default_config()
+        cc_template = registry.get_default_for_type("credit_card_statement")
+
+        assert cc_template.document_type == "credit_card_statement", (
+            "Default CC template must have document_type='credit_card_statement'; "
+            "falling back to a bank_statement template causes phantom rows in CC output"
+        )
+        assert cc_template.id == "aib_credit_card"
+
+    def test_cc_columns_include_transaction_details(self):
+        """aib_credit_card columns include 'Transaction Details'.
+
+        RefContinuationClassifier uses ColumnTypeIdentifier to find the description
+        column by scanning column names for DESCRIPTION_PATTERNS. 'Transaction Details'
+        matches via 'detail' (or 'transaction'). If bank columns ('Details') are used
+        instead, the RefContinuationClassifier still works — but RowBuilder assigns
+        words to wrong positions because CC column x-boundaries differ from bank ones.
+        The CC template must define the correct x-boundaries for CC PDFs.
+        """
+        registry = TemplateRegistry.from_default_config()
+        aib_cc = registry.get_template("aib_credit_card")
+
+        assert aib_cc is not None
+        col_names = list(aib_cc.extraction.columns.keys())
+        assert "Transaction Details" in col_names, (
+            "CC template must have 'Transaction Details' column; "
+            "RefContinuationClassifier finds it via DESCRIPTION_PATTERNS"
+        )
+        assert "Transaction Date" in col_names
+        assert "Posting Date" in col_names
+        assert "Amount" in col_names
+
+    def test_cc_column_boundaries_match_aib_pdf_layout(self):
+        """aib_credit_card column x-boundaries follow the AIB CC PDF column order.
+
+        The boundaries come from the cc_columns fixture in
+        test_row_merger_integration.py (derived from real AIB CC PDF analysis).
+        Posting Date must start at or after the end of Transaction Date, and Amount
+        at or after the end of Transaction Details; otherwise RowBuilder assigns
+        words to wrong columns and Ref lines are classified as transactions.
+        """
+        registry = TemplateRegistry.from_default_config()
+        aib_cc = registry.get_template("aib_credit_card")
+
+        assert aib_cc is not None
+        cols = aib_cc.extraction.columns
+
+        txn_date = cols["Transaction Date"]
+        posting_date = cols["Posting Date"]
+        txn_details = cols["Transaction Details"]
+        amount = cols["Amount"]
+
+        # Transaction Date: narrow left-most column
+        assert txn_date[0] < txn_date[1], "Transaction Date must have positive width"
+        # Posting Date starts where Transaction Date ends (compare against the END)
+        assert (
+            posting_date[0] >= txn_date[1]
+        ), "Posting Date must start at or after the end of Transaction Date"
+        # Transaction Details is the wide middle column
+        assert (txn_details[1] - txn_details[0]) > (
+            txn_date[1] - txn_date[0]
+        ), "Transaction Details should be wider than Transaction Date"
+        # Amount is rightmost and must not overlap Transaction Details
+        assert (
+            amount[0] >= txn_details[1]
+        ), "Amount must start at or after the end of Transaction Details"
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])