|
17 | 17 | from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT |
18 | 18 | from datetime import datetime |
19 | 19 | from reportlab.pdfgen import canvas |
20 | | -from extractors import index_and_extract |
| 20 | +from extractors import index_and_extract, extract_and_store_from_indexed |
21 | 21 | from llm_sections import run_report_sections |
22 | 22 | from db import ( |
23 | 23 | list_sections, |
|
36 | 36 | get_latest_questionnaire_for_user, |
37 | 37 | link_questionnaire_upload, |
38 | 38 | list_questionnaire_uploads, |
| 39 | + insert_metric, |
| 40 | + delete_metrics_for_doc_keys, |
39 | 41 | update_questionnaire_upload_metadata, |
40 | 42 | ) |
41 | 43 | # --- Initialization --- |
@@ -68,6 +70,90 @@ def _register_temp_download(data: bytes, filename: str, mimetype: str = "applica |
68 | 70 | TEMP_REPORTS[token] = (filename, data, mimetype) |
69 | 71 | return token |
70 | 72 |
|
def _persist_metrics_for_doc(document_id: int, data: dict):
    """
    Persist key numeric insights from extracted data into metrics for aggregation/prefill.

    Keys stored (each only when a finite numeric value can be derived):
      - opening_balance, closing_balance, total_inflows, total_outflows
        (read from data["account_summary"])
      - gross_total_income, taxable_income, total_tax_paid
        (read from the top level, falling back to data["tax_computation"])
      - sum_assured_or_insured (read from the top level)
      - portfolio_equity/debt/gold/realEstate/insuranceLinked/cash
        (read from data["asset_allocation"]; the "*_percentage" keys take
        precedence over the short alternate keys)

    Existing metrics for the same document and keys are deleted before the new
    values are inserted. Persistence is best-effort: a failing delete or insert
    is reported with print() and never raises to the caller.

    Args:
        document_id: Passed through to delete_metrics_for_doc_keys / insert_metric.
        data: Extracted document payload; anything other than a dict is ignored.
    """
    import math  # local import: only needed for the finiteness check in n()

    if not isinstance(data, dict):
        return

    def n(v):
        """Coerce v to a finite float, or return None when that is not possible."""
        # bool is an int subclass; a stray True/False is not a metric value.
        if v is None or isinstance(v, bool):
            return None
        try:
            if isinstance(v, (int, float)):
                num = float(v)
            else:
                if v in ("", "N/A"):
                    return None
                cv = clean_and_convert_to_float(str(v))
                # The original code treats "N/A" from the cleaner as "no value".
                if cv is None or cv == "N/A":
                    return None
                num = float(cv)
        except Exception:
            return None
        # NaN / inf would poison the sums computed from these metrics later.
        return num if math.isfinite(num) else None

    def section(key):
        """Return data[key] when it is a dict, otherwise an empty dict."""
        value = data.get(key)
        return value if isinstance(value, dict) else {}

    to_store = {}

    # Bank account summary
    acct = section("account_summary")
    for k in ("opening_balance", "closing_balance", "total_inflows", "total_outflows"):
        nv = n(acct.get(k))
        if nv is not None:
            to_store[k] = nv

    # ITR numbers (top-level first, then under tax_computation)
    tax_comp = section("tax_computation")
    for k in ("gross_total_income", "taxable_income", "total_tax_paid"):
        nv = n(data.get(k))
        if nv is None:
            # Fall back whenever the top-level value is missing OR unusable.
            nv = n(tax_comp.get(k))
        if nv is not None:
            to_store[k] = nv

    # Insurance
    ins_nv = n(data.get("sum_assured_or_insured"))
    if ins_nv is not None:
        to_store["sum_assured_or_insured"] = ins_nv

    # CAS allocation: canonical "*_percentage" keys are listed first so they win;
    # the short alternate keys only fill in what is still missing.
    alloc = section("asset_allocation")
    allocation_sources = (
        ("equity_percentage", "portfolio_equity"),
        ("debt_percentage", "portfolio_debt"),
        ("gold_percentage", "portfolio_gold"),
        ("real_estate_percentage", "portfolio_realEstate"),
        ("insurance_linked_percentage", "portfolio_insuranceLinked"),
        ("cash_percentage", "portfolio_cash"),
        # Alternate keys, used only as a fallback
        ("equity", "portfolio_equity"),
        ("debt", "portfolio_debt"),
        ("gold", "portfolio_gold"),
        ("realEstate", "portfolio_realEstate"),
        ("insuranceLinked", "portfolio_insuranceLinked"),
        ("cash", "portfolio_cash"),
    )
    for src, dst in allocation_sources:
        if dst in to_store or src not in alloc:
            continue
        nv = n(alloc.get(src))
        if nv is not None:
            to_store[dst] = nv

    if not to_store:
        return

    # Replace, don't append: clear stale rows for exactly the keys being written.
    try:
        delete_metrics_for_doc_keys(document_id, list(to_store))
    except Exception as e:
        print(f"Delete stale metrics failed for document {document_id}: {e}")

    for k, v in to_store.items():
        try:
            insert_metric(document_id, k, v, None)
        except Exception as e:
            # Keep going so one bad key does not block the remaining metrics.
            print(f"Insert metric '{k}' failed for document {document_id}: {e}")
| 156 | + |
71 | 157 | # --- Utility Function for Cleaning Numbers --- |
72 | 158 | def clean_and_convert_to_float(value_str): |
73 | 159 | """Cleans currency symbols, commas, and parentheses, then converts to float.""" |
@@ -1638,88 +1724,112 @@ def analyze_financial_health(payload: dict): |
1638 | 1724 | # --- Document insights aggregation (from linked uploads) --- |
def aggregate_doc_insights_for_questionnaire(qid: int) -> dict:
    """
    Aggregate insights from all documents linked to the questionnaire.

    Combines:
      - DB metrics read via list_metrics(document_id)
        (numeric aggregations for bank/CAS/ITR/insurance)
      - Indexed section/table-driven summaries via
        extract_and_store_from_indexed(document_id)

    Each linked document is processed once, even if it is linked more than
    once, so its inflows/outflows are not double counted. Failures while
    extracting or reading metrics for one document are reported with print()
    and do not stop the aggregation of the remaining documents.

    Returns a dict containing only the sections that have data:
        {
          "bank": {total_inflows, total_outflows, opening_balance,
                   closing_balance, net_cashflow},
          "portfolio": {equity, debt, gold, realEstate, insuranceLinked, cash},
          "insurance": {"sum_assured_or_insured": number,
                        "insurance_type": text},   # each key only if present
          "itr": {"gross_total_income": n, "taxable_income": n,
                  "total_tax_paid": n},
          "raw_extracts": [
            {"document_id": ..., "summary": <result of
                                  extract_and_store_from_indexed>},
            ...
          ]
        }

    Inflows/outflows are summed across documents. For the other numeric keys
    the value from the last processed document that provides it wins.
    """
    uploads = list_questionnaire_uploads(qid) or []
    # dict.fromkeys de-duplicates while keeping the original link order.
    doc_ids = list(dict.fromkeys(
        r["document_id"] for r in uploads if r["document_id"] is not None
    ))

    # Metric keys are matched case-insensitively (lowercased below), but the
    # consumers of the "portfolio" section look up camelCase names, so the
    # lowercased suffix is mapped back to its canonical spelling.
    portfolio_prefix = "portfolio_"
    canonical_portfolio_keys = {
        "realestate": "realEstate",
        "insurancelinked": "insuranceLinked",
    }
    itr_keys = ("gross_total_income", "taxable_income", "total_tax_paid")
    insurance_sum_keys = ("insurance_sum_assured_or_insured", "sum_assured_or_insured")

    # Aggregates
    bank = {"total_inflows": 0.0, "total_outflows": 0.0, "opening_balance": None, "closing_balance": None}
    portfolio_alloc = {}
    insurance = {}
    itr = {}

    per_doc_extracts = []

    for did in doc_ids:
        # 1) Deterministic summaries from indexed sections/tables
        try:
            idx_summary = extract_and_store_from_indexed(did) or {}
            per_doc_extracts.append({"document_id": did, "summary": idx_summary})
            # Merge opening/closing only; flows come from metrics to avoid double counting.
            acct = idx_summary.get("account_summary") or {}
            for key in ("opening_balance", "closing_balance"):
                val = acct.get(key)
                if (
                    bank[key] is None
                    and isinstance(val, (int, float))
                    and not isinstance(val, bool)
                ):
                    bank[key] = float(val)
        except Exception as e:
            # Continue with metrics-only for this document.
            print(f"Indexed extraction failed for document {did}: {e}")

        # 2) Numeric metrics for CAS allocation/ITR/insurance/bank (authoritative numeric store)
        try:
            mets = list_metrics(did) or []
        except Exception as e:
            print(f"Listing metrics failed for document {did}: {e}")
            mets = []

        for m in mets:
            try:
                md = dict(m)
                k = (md.get("key") or "").strip().lower()

                # Text metric: read by the insurance merge to decide between
                # life and health cover.
                if k == "insurance_type":
                    vtxt = md.get("value_text")
                    if vtxt:
                        insurance["insurance_type"] = vtxt
                    continue

                vnum = md.get("value_num")
                if vnum is None:
                    continue
                val = float(vnum)

                if k in ("total_inflows", "total_outflows"):
                    bank[k] += val
                elif k in ("opening_balance", "closing_balance"):
                    bank[k] = val
                elif k.startswith(portfolio_prefix):
                    suffix = k[len(portfolio_prefix):]
                    portfolio_alloc[canonical_portfolio_keys.get(suffix, suffix)] = val
                elif k in insurance_sum_keys:
                    insurance["sum_assured_or_insured"] = val
                elif k in itr_keys:
                    itr[k] = val
            except Exception:
                # A malformed metric row must not abort the whole aggregation.
                continue

    # Compute net cashflow
    try:
        inflow = float(bank.get("total_inflows") or 0.0)
        outflow = float(bank.get("total_outflows") or 0.0)
        bank["net_cashflow"] = inflow - outflow
    except Exception:
        bank["net_cashflow"] = None

    out: dict = {}
    # Only report the bank section when at least one value is meaningful.
    if any(v not in (None, 0.0) for v in bank.values()):
        out["bank"] = bank
    if portfolio_alloc:
        out["portfolio"] = portfolio_alloc
    if insurance:
        out["insurance"] = insurance
    if itr:
        out["itr"] = itr
    if per_doc_extracts:
        out["raw_extracts"] = per_doc_extracts
    return out
1724 | 1834 |
|
1725 | 1835 | def _get_cas_data_for_questionnaire(qid: int) -> dict: |
@@ -1766,18 +1876,21 @@ def build_prefill_from_insights(qid: int) -> dict: |
1766 | 1876 |
|
1767 | 1877 | lifestyle = {} |
1768 | 1878 | try: |
1769 | | - inflow = itr.get("gross_total_income") |
1770 | | - if not isinstance(inflow, (int, float)) or inflow <= 0: |
1771 | | - inflow = bank.get("total_inflows") |
1772 | | - outflow = bank.get("total_outflows") |
1773 | | - netcf = bank.get("net_cashflow") |
| 1879 | + # Annual income prefers ITR; monthly expenses from bank; savings% strictly from bank flows |
| 1880 | + itr_income = itr.get("gross_total_income") |
| 1881 | + bank_inflow = bank.get("total_inflows") |
| 1882 | + bank_outflow = bank.get("total_outflows") |
| 1883 | + |
| 1884 | + inflow = itr_income if isinstance(itr_income, (int, float)) and itr_income > 0 else bank_inflow |
| 1885 | + outflow = bank_outflow |
1774 | 1886 |
|
1775 | 1887 | if isinstance(inflow, (int, float)) and inflow > 0: |
1776 | 1888 | lifestyle["annual_income"] = round(float(inflow), 2) |
1777 | 1889 | if isinstance(outflow, (int, float)) and outflow > 0: |
1778 | 1890 | lifestyle["monthly_expenses"] = round(float(outflow) / 12.0, 2) |
1779 | | - if isinstance(inflow, (int, float)) and inflow > 0 and isinstance(outflow, (int, float)): |
1780 | | - sp = max(0.0, (float(inflow) - float(outflow))) / float(inflow) * 100.0 |
| 1891 | + if isinstance(bank_inflow, (int, float)) and bank_inflow > 0 and isinstance(bank_outflow, (int, float)): |
| 1892 | + sp = max(0.0, (float(bank_inflow) - float(bank_outflow))) / float(bank_inflow) * 100.0 |
| 1893 | + sp = max(0.0, min(100.0, sp)) |
1781 | 1894 | lifestyle["savings_percent"] = round(sp, 2) |
1782 | 1895 | except Exception: |
1783 | 1896 | pass |
@@ -1923,6 +2036,10 @@ def upload_document(): |
1923 | 2036 | bank_data["provenance"] = summaries["provenance"] |
1924 | 2037 | # Do not include raw transactions in PDF; attach summary only |
1925 | 2038 | extracted_data[f"{name} {idx+1}"] = bank_data |
| 2039 | + try: |
| 2040 | + _persist_metrics_for_doc(doc_id, bank_data) |
| 2041 | + except Exception as e: |
| 2042 | + print(f"Persist metrics (bank) failed: {e}") |
1926 | 2043 | else: |
1927 | 2044 | other_data = func(text) |
1928 | 2045 | # Merge DB-backed summaries if present (useful for CAS/Portfolio PDFs) |
@@ -1954,6 +2071,10 @@ def upload_document(): |
1954 | 2071 | print(f"Error updating CAS metadata: {e}") |
1955 | 2072 |
|
1956 | 2073 | extracted_data[f"{name} {idx+1}"] = other_data |
| 2074 | + try: |
| 2075 | + _persist_metrics_for_doc(doc_id, other_data) |
| 2076 | + except Exception as e: |
| 2077 | + print(f"Persist metrics ({doc_type}) failed: {e}") |
1957 | 2078 | else: |
1958 | 2079 | # Generic fallback: still return deterministic DB-backed summaries even if doc type is unknown |
1959 | 2080 | generic = {} |
@@ -2272,10 +2393,46 @@ def _assemble_financial_inputs(q: dict, doc_insights=None) -> dict: |
2272 | 2393 | outflow = bank.get("total_outflows") |
2273 | 2394 | if isinstance(inflow, (int, float)) and inflow > 0 and isinstance(outflow, (int, float)): |
2274 | 2395 | sp = max(0.0, (inflow - outflow)) / inflow * 100.0 |
| 2396 | + sp = max(0.0, min(100.0, sp)) |
2275 | 2397 | payload["savings"]["savingsPercent"] = round(sp, 2) |
2276 | 2398 | except Exception: |
2277 | 2399 | pass |
2278 | 2400 |
|
| 2401 | + # Merge insurance covers from document insights when questionnaire values are absent |
| 2402 | + try: |
| 2403 | + ins = di.get("insurance") or {} |
| 2404 | + sum_val = ins.get("sum_assured_or_insured") |
| 2405 | + ins_type = str(ins.get("insurance_type") or "").lower() |
| 2406 | + if isinstance(sum_val, (int, float)) and sum_val > 0: |
| 2407 | + if not payload["insurance"].get("lifeCover") and ("life" in ins_type or "term" in ins_type or "ulip" in ins_type): |
| 2408 | + payload["insurance"]["lifeCover"] = float(sum_val) |
| 2409 | + if not payload["insurance"].get("healthCover") and ("health" in ins_type or "mediclaim" in ins_type): |
| 2410 | + payload["insurance"]["healthCover"] = float(sum_val) |
| 2411 | + # Unknown type: default to life cover if both missing |
| 2412 | + if not payload["insurance"].get("lifeCover") and not payload["insurance"].get("healthCover"): |
| 2413 | + payload["insurance"]["lifeCover"] = float(sum_val) |
| 2414 | + except Exception: |
| 2415 | + pass |
| 2416 | + |
| 2417 | + # Merge portfolio allocation from document insights (CAS) when missing |
| 2418 | + try: |
| 2419 | + port = di.get("portfolio") or {} |
| 2420 | + alloc = payload.get("investments", {}).get("allocation") or {} |
| 2421 | + def _set_if_missing(key, src_key): |
| 2422 | + if alloc.get(key) in (None, "", 0): |
| 2423 | + v = port.get(src_key) |
| 2424 | + if isinstance(v, (int, float)) and v >= 0: |
| 2425 | + alloc[key] = float(v) |
| 2426 | + _set_if_missing("equity", "equity") |
| 2427 | + _set_if_missing("debt", "debt") |
| 2428 | + _set_if_missing("gold", "gold") |
| 2429 | + _set_if_missing("realEstate", "realEstate") |
| 2430 | + _set_if_missing("insuranceLinked", "insuranceLinked") |
| 2431 | + _set_if_missing("cash", "cash") |
| 2432 | + payload["investments"]["allocation"] = alloc |
| 2433 | + except Exception: |
| 2434 | + pass |
| 2435 | + |
2279 | 2436 | return payload |
2280 | 2437 |
|
2281 | 2438 | def _build_client_facts(q: dict, analysis: dict, doc_insights=None) -> dict: |
@@ -2321,8 +2478,11 @@ def _build_client_facts(q: dict, analysis: dict, doc_insights=None) -> dict: |
2321 | 2478 | "total_inflows": bank.get("total_inflows"), |
2322 | 2479 | "total_outflows": bank.get("total_outflows"), |
2323 | 2480 | "net_cashflow": bank.get("net_cashflow"), |
| 2481 | + "opening_balance": bank.get("opening_balance"), |
| 2482 | + "closing_balance": bank.get("closing_balance"), |
2324 | 2483 | }, |
2325 | 2484 | "portfolio": portfolio, |
| 2485 | + "extracts": di.get("raw_extracts"), |
2326 | 2486 | "analysis": analysis, |
2327 | 2487 | } |
2328 | 2488 | return facts |
|
0 commit comments