@@ -756,14 +756,39 @@ def extract_bank_statement_hybrid(text, transactions_payload=None, save_json_pat
756756 - average_monthly_balance
757757
758758 2. recurring_credits: Array of objects with description, amount, frequency, dates
759- 3. recurring_debits: Array of objects with description, amount, frequency, dates
759+
760+ 3. recurring_debits: Array of objects with:
761+ - description: Transaction description
762+ - amount: Numeric amount (no currency symbols)
763+ - frequency: Monthly/Quarterly/Yearly/Ad-hoc
764+ - dates: Array of dates when this recurring debit occurred
765+ - is_emi: Boolean - true if this is an EMI/loan payment, false otherwise
766+
767+ **EMI IDENTIFICATION RULES (IMPORTANT):**
768+ Mark is_emi=true for transactions that match ANY of these patterns:
769+ - Contains "EMI", "LOAN", "LN", "MORTGAGE", "INSTALMENT", "INSTALLMENT", "REPAYMENT"
770+ - Contains bank-specific loan codes: "ELM" (ICICI EMI), "HDFC LN", "SBI LN", "AXIS LN"
771+ - Contains "NACH" (National Automated Clearing House - often used for auto-debit EMIs/loans)
772+ - Contains "SI/" or "STANDING INSTRUCTION" for loan payments
773+ - Contains "ECS" (Electronic Clearing Service) for recurring loan debits
774+ - Contains "AUTO DEBIT" or "AUTODEBIT" for loan/EMI payments
775+ - Contains "HOME LOAN", "CAR LOAN", "PERSONAL LOAN", "VEHICLE LOAN", "HOUSING LOAN", "EDUCATION LOAN"
776+ - Same amount debited on or around the same date each month (±3 days) - likely an EMI
777+
778+ Common EMI transaction patterns in Indian banks:
779+ - ICICI: "BIL/INFT/ELM...", "NACH/..."
780+ - HDFC: "HDFC LN...", "NACH/HDFC..."
781+ - SBI: "EMI DED/...", "NACH/SBI..."
782+ - Axis: "AXIS LN...", "NACH/AXIS..."
783+
760784 4. high_value_transactions: Array with date, description, type, amount (threshold: 100000)
761785 5. bounce_penalty_charges: Array with date, description, amount
762786
763787 Notes:
764788 - Use the structured JSON transactions as the source of truth when present.
765789 - Use exact numeric values; do not include currency symbols.
766790 - Frequency can be Monthly/Quarterly/Yearly/Ad-hoc.
791+ - For recurring_debits, ALWAYS include the is_emi boolean field.
767792
768793 Structured Transactions JSON (optional):
769794 { tx_json_str if tx_json_str else "<none>" }
@@ -986,7 +1011,7 @@ def extract_insurance_hybrid(text):
9861011 data = {}
9871012
9881013 is_life_insurance = bool (re .search (r"(?i)(life\s+insurance|term\s+plan|endowment|ULIP|whole\s+life)" , text ))
989- is_health_insurance = bool (re .search (r"(?i)(health\s+insurance|mediclaim|medical\s+insurance|hospitali[sz]ation|health\s*cover|in-?patient\s+treatment|annual\s+sum\s+insured|sum\s+insured|health\s+advantedge|health\s+plan|health\s+policy)" , text ))
1014+ is_health_insurance = bool (re .search (r"(?i)(health\s+insurance|mediclaim|medical\s+insurance|hospitali[sz]ation|health\s*cover|in-?patient\s+treatment|annual\s+sum\s+insured|sum\s+insured|health\s+advantedge|health\s+plan|health\s+policy|medicare\s*premier|tata\s*aig\s*medicare )" , text ))
9901015 is_general_insurance = bool (re .search (r"(?i)(motor\s+insurance|vehicle\s+insurance|property\s+insurance|home\s+insurance|fire\s+insurance)" , text ))
9911016
9921017 # Debug logging
@@ -1006,7 +1031,8 @@ def extract_insurance_hybrid(text):
10061031 patterns = {
10071032 "policy_number" : [
10081033 r"(?i)Policy\s*(?:No\.?|Number)[\s:\-]*([A-Z0-9\-/]{6,25})" ,
1009- r"(?i)Policy[\s:\-]*([A-Z0-9\-/]{6,25})"
1034+ r"(?i)Policy[\s:\-]*([A-Z0-9\-/]{6,25})" ,
1035+ r"(?i)Member\s*(?:ID|No\.?)[\s:\-]*([A-Z0-9]{10,25})" # TATA AIG uses Member ID
10101036 ],
10111037 "insurer_name" : [
10121038 r"(?i)(?:Insurer|Company|Insurance\s+Company)[\s:\-]*([A-Za-z\s&]+?)(?:\n|Ltd|Limited|Insurance)" ,
@@ -1086,6 +1112,8 @@ def _parse_indian_amount(num_str, suffix_str=""):
10861112 ] if is_life_insurance else [
10871113 # Highest priority: "Annual Sum Insured" - this is the actual policy value
10881114 r"(?i)Annual\s+Sum\s+Insured[\s:\-\|]*(?:Rs\.?|₹)?\s*([\d,]+(?:\.\d+)?)\s*(Lakhs?|Lacs?|Crores?|Cr)?" ,
1115+ # TATA AIG format: "Sum Insured (₹)#" with value in same or next cell
1116+ r"(?i)Sum\s+Insured\s*\(?₹?\)?[#*]*[\s:\-\|]*([\d,]+(?:\.\d+)?)\s*(Lakhs?|Lacs?|Crores?|Cr)?" ,
10891117 # Next: general Sum Insured/Assured patterns
10901118 r"(?i)Sum\s*(?:Insured|Assured)[\s:\-]*(?:Rs\.?|₹)?\s*([\d,]+(?:\.\d+)?)\s*(Lakhs?|Lacs?|Crores?|Cr)?" ,
10911119 r"(?i)Cover(?:age)?\s*Amount[\s:\-]*(?:Rs\.?|₹)?\s*([\d,]+(?:\.\d+)?)\s*(Lakhs?|Lacs?|Crores?|Cr)?" ,
@@ -2321,7 +2349,21 @@ def build_prefill_from_insights(qid: int) -> dict:
23212349 try :
23222350 uploads = list_questionnaire_uploads (qid ) or []
23232351 total_monthly_emi = 0.0
2324- emi_keywords = ["emi" , "loan" , "mortgage" , "instalment" , "installment" , "repayment" , "home loan" , "car loan" , "personal loan" , "vehicle loan" , "housing loan" ]
2352+ # Expanded EMI keywords including bank-specific codes
2353+ emi_keywords = [
2354+ "emi" , "loan" , "ln" , "mortgage" , "instalment" , "installment" , "repayment" ,
2355+ "home loan" , "car loan" , "personal loan" , "vehicle loan" , "housing loan" , "education loan" ,
2356+ # Bank-specific codes
2357+ "elm" , # ICICI EMI/Loan
2358+ "nach" , # National Automated Clearing House (auto-debit EMIs)
2359+ "hdfc ln" , "sbi ln" , "icici ln" , "axis ln" , "kotak ln" , # Bank loan prefixes
2360+ "si/" , # Standing Instruction
2361+ "standing instruction" ,
2362+ "ecs" , # Electronic Clearing Service
2363+ "auto debit" , "autodebit" ,
2364+ "emi ded" , # SBI pattern
2365+ "bil/inft/elm" , # ICICI pattern
2366+ ]
23252367 for upload in uploads :
23262368 if (upload ["doc_type" ] or "" ).lower () == "bank statement" :
23272369 metadata_json = upload ["metadata_json" ]
@@ -2334,8 +2376,14 @@ def build_prefill_from_insights(qid: int) -> dict:
23342376 desc = (debit .get ("description" ) or "" ).lower ()
23352377 amount = debit .get ("amount" )
23362378 freq = (debit .get ("frequency" ) or "" ).lower ()
2337- # Check if this is an EMI payment
2338- if any (kw in desc for kw in emi_keywords ):
2379+ is_emi = debit .get ("is_emi" , False )
2380+
2381+ # Check if this is an EMI payment:
2382+ # 1. LLM marked it as EMI (is_emi=True)
2383+ # 2. OR description matches EMI keywords
2384+ is_emi_payment = is_emi or any (kw in desc for kw in emi_keywords )
2385+
2386+ if is_emi_payment :
23392387 if isinstance (amount , (int , float )) and amount > 0 :
23402388 # Convert to monthly if not already monthly
23412389 if freq in ("monthly" , "month" ):
@@ -2351,9 +2399,8 @@ def build_prefill_from_insights(qid: int) -> dict:
23512399 continue
23522400 if total_monthly_emi > 0 :
23532401 lifestyle ["monthly_emi" ] = round (total_monthly_emi , 2 )
2354- logger .debug (f"Prefill extracted monthly_emi: { lifestyle ['monthly_emi' ]} " )
23552402 except Exception as e :
2356- logger . warning ( f"Prefill error extracting monthly_emi: { e } " )
2403+ pass # Silently handle errors to avoid breaking prefill
23572404
23582405 allocation = {}
23592406 try :
0 commit comments