Skip to content

Commit 9cce247

Browse files
committed
2 parents da80d28 + 217c139 commit 9cce247

4 files changed

Lines changed: 517 additions & 57 deletions

File tree

app.py

Lines changed: 217 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
1818
from datetime import datetime
1919
from reportlab.pdfgen import canvas
20-
from extractors import index_and_extract
20+
from extractors import index_and_extract, extract_and_store_from_indexed
2121
from llm_sections import run_report_sections
2222
from db import (
2323
list_sections,
@@ -36,6 +36,8 @@
3636
get_latest_questionnaire_for_user,
3737
link_questionnaire_upload,
3838
list_questionnaire_uploads,
39+
insert_metric,
40+
delete_metrics_for_doc_keys,
3941
update_questionnaire_upload_metadata,
4042
)
4143
# --- Initialization ---
@@ -68,6 +70,90 @@ def _register_temp_download(data: bytes, filename: str, mimetype: str = "applica
6870
TEMP_REPORTS[token] = (filename, data, mimetype)
6971
return token
7072

73+
def _persist_metrics_for_doc(document_id: int, data: dict):
    """
    Persist key numeric insights from extracted data into metrics for aggregation/prefill.

    Keys stored (each only when a numeric value can be derived):
    - opening_balance, closing_balance, total_inflows, total_outflows
      (read from data["account_summary"])
    - gross_total_income, taxable_income, total_tax_paid
      (read from the top level, falling back to data["tax_computation"])
    - sum_assured_or_insured (read from the top level)
    - portfolio_equity/debt/gold/realEstate/insuranceLinked/cash
      (read from data["asset_allocation"])

    Existing metrics for the collected keys are deleted before the new values
    are inserted. Persistence is best-effort: failures are printed, never raised.

    Args:
        document_id: Identifier handed to the metric store helpers.
        data: Extracted document payload; anything other than a dict is ignored.
    """
    if not isinstance(data, dict):
        return

    def section(name):
        # A malformed payload may carry a string/list/number where a mapping is
        # expected; treat that as "section absent" instead of failing on .get().
        value = data.get(name)
        return value if isinstance(value, dict) else {}

    def n(v):
        """Return v as a float, or None when no numeric value can be derived."""
        try:
            # bool is an int subclass, but a True/False flag is not a metric value.
            if isinstance(v, bool):
                return None
            if isinstance(v, (int, float)):
                return float(v)
            if v in (None, "", "N/A"):
                return None
            cv = clean_and_convert_to_float(str(v))
            return cv if cv != "N/A" else None
        except Exception:
            return None

    to_store = {}

    # Bank account summary
    acct = section("account_summary")
    for k in ("opening_balance", "closing_balance", "total_inflows", "total_outflows"):
        nv = n(acct.get(k))
        if nv is not None:
            to_store[k] = nv

    # ITR numbers (top-level first, then under tax_computation)
    tax_comp = section("tax_computation")
    for k in ("gross_total_income", "taxable_income", "total_tax_paid"):
        nv = n(data.get(k))
        if nv is None:
            nv = n(tax_comp.get(k))
        if nv is not None:
            to_store[k] = nv

    # Insurance
    ins_nv = n(data.get("sum_assured_or_insured"))
    if ins_nv is not None:
        to_store["sum_assured_or_insured"] = ins_nv

    # CAS allocation. The *_percentage keys are listed first and win; the short
    # alternate keys are only used when the primary key yields no number.
    alloc = section("asset_allocation")
    mapping = (
        ("equity_percentage", "portfolio_equity"),
        ("debt_percentage", "portfolio_debt"),
        ("gold_percentage", "portfolio_gold"),
        ("real_estate_percentage", "portfolio_realEstate"),
        ("insurance_linked_percentage", "portfolio_insuranceLinked"),
        ("cash_percentage", "portfolio_cash"),
        # Alternate keys, used as a fallback only
        ("equity", "portfolio_equity"),
        ("debt", "portfolio_debt"),
        ("gold", "portfolio_gold"),
        ("realEstate", "portfolio_realEstate"),
        ("insuranceLinked", "portfolio_insuranceLinked"),
        ("cash", "portfolio_cash"),
    )
    for src, dst in mapping:
        if dst in to_store:
            continue
        nv = n(alloc.get(src))
        if nv is not None:
            to_store[dst] = nv

    if not to_store:
        return

    try:
        delete_metrics_for_doc_keys(document_id, list(to_store))
    except Exception as e:
        # Still attempt the inserts (best-effort), but make the failure visible:
        # stale rows left behind may coexist with the rows inserted below.
        print(f"Delete stale metrics failed for document {document_id}: {e}")

    for k, v in to_store.items():
        try:
            insert_metric(document_id, k, v, None)
        except Exception as e:
            print(f"Insert metric '{k}' failed for document {document_id}: {e}")
156+
71157
# --- Utility Function for Cleaning Numbers ---
72158
def clean_and_convert_to_float(value_str):
73159
"""Cleans currency symbols, commas, and parentheses, then converts to float."""
@@ -1638,88 +1724,112 @@ def analyze_financial_health(payload: dict):
16381724
# --- Document insights aggregation (from linked uploads) ---
16391725
def aggregate_doc_insights_for_questionnaire(qid: int) -> dict:
    """
    Aggregate insights from all documents linked to the questionnaire.

    Combines, per linked document:
    - the summary returned by extract_and_store_from_indexed(document_id)
      (kept verbatim under "raw_extracts"; its account_summary may seed the
      bank opening/closing balance)
    - the numeric/text metrics returned by list_metrics(document_id)

    Returns a dict holding only the sections that have data:
        {
            "bank": {total_inflows, total_outflows, opening_balance,
                     closing_balance, net_cashflow},
            "portfolio": {equity, debt, gold, realEstate, insuranceLinked, cash},
            "insurance": {"sum_assured_or_insured": number, "insurance_type": str},
            "itr": {gross_total_income, taxable_income, total_tax_paid},
            "raw_extracts": [{"document_id": int, "summary": {...}}, ...],
        }

    Inflows/outflows are summed across documents; every other metric keeps the
    value from the last document that provides it. Failures on a single
    document are printed and that document is skipped for the failing step.
    """
    uploads = list_questionnaire_uploads(qid) or []
    doc_ids = [r["document_id"] for r in uploads if r["document_id"] is not None]

    # Metric keys are matched case-insensitively, but consumers of the
    # "portfolio" section look up camelCase names, so restore them here.
    canonical_portfolio_keys = {
        "realestate": "realEstate",
        "insurancelinked": "insuranceLinked",
    }
    prefix = "portfolio_"

    # Aggregates
    bank = {"total_inflows": 0.0, "total_outflows": 0.0, "opening_balance": None, "closing_balance": None}
    portfolio_alloc = {}
    insurance = {}
    itr = {}
    per_doc_extracts = []

    for did in doc_ids:
        # 1) Summary from indexed sections/tables
        try:
            idx_summary = extract_and_store_from_indexed(did) or {}
        except Exception as e:
            # Continue with metrics-only for this document.
            print(f"Indexed extraction failed for document {did}: {e}")
            idx_summary = None

        if idx_summary is not None:
            per_doc_extracts.append({"document_id": did, "summary": idx_summary})
            acct = idx_summary.get("account_summary") if isinstance(idx_summary, dict) else None
            if isinstance(acct, dict):
                # Only opening/closing are taken from the summary; totals come
                # from metrics to avoid double counting.
                for key in ("opening_balance", "closing_balance"):
                    val = acct.get(key)
                    if bank[key] is None and isinstance(val, (int, float)):
                        bank[key] = float(val)

        # 2) Stored metrics for bank/CAS/ITR/insurance
        try:
            mets = list_metrics(did) or []
        except Exception as e:
            print(f"Listing metrics failed for document {did}: {e}")
            mets = []

        for m in mets:
            try:
                md = dict(m)
                k = (md.get("key") or "").strip().lower()
                vnum = md.get("value_num")
                vtxt = md.get("value_text")

                if k == "insurance_type":
                    # Text metric: needed downstream to tell life from health cover.
                    if vtxt:
                        insurance["insurance_type"] = vtxt
                    continue
                if vnum is None:
                    continue

                num = float(vnum)
                if k in ("total_inflows", "total_outflows"):
                    bank[k] += num
                elif k in ("opening_balance", "closing_balance"):
                    bank[k] = num
                elif k.startswith(prefix):
                    short = k[len(prefix):]
                    portfolio_alloc[canonical_portfolio_keys.get(short, short)] = num
                elif k in ("insurance_sum_assured_or_insured", "sum_assured_or_insured"):
                    insurance["sum_assured_or_insured"] = num
                elif k in ("gross_total_income", "taxable_income", "total_tax_paid"):
                    itr[k] = num
            except Exception:
                # Skip a malformed metric row; the remaining rows are still used.
                continue

    # Compute net cashflow (both totals are always floats at this point)
    bank["net_cashflow"] = bank["total_inflows"] - bank["total_outflows"]

    out = {}
    if any(v not in (None, 0.0) for v in bank.values()):
        out["bank"] = bank
    if portfolio_alloc:
        out["portfolio"] = portfolio_alloc
    if insurance:
        out["insurance"] = insurance
    if itr:
        out["itr"] = itr
    if per_doc_extracts:
        out["raw_extracts"] = per_doc_extracts
    return out
17241834

17251835
def _get_cas_data_for_questionnaire(qid: int) -> dict:
@@ -1766,18 +1876,21 @@ def build_prefill_from_insights(qid: int) -> dict:
17661876

17671877
lifestyle = {}
17681878
try:
1769-
inflow = itr.get("gross_total_income")
1770-
if not isinstance(inflow, (int, float)) or inflow <= 0:
1771-
inflow = bank.get("total_inflows")
1772-
outflow = bank.get("total_outflows")
1773-
netcf = bank.get("net_cashflow")
1879+
# Annual income prefers ITR; monthly expenses from bank; savings% strictly from bank flows
1880+
itr_income = itr.get("gross_total_income")
1881+
bank_inflow = bank.get("total_inflows")
1882+
bank_outflow = bank.get("total_outflows")
1883+
1884+
inflow = itr_income if isinstance(itr_income, (int, float)) and itr_income > 0 else bank_inflow
1885+
outflow = bank_outflow
17741886

17751887
if isinstance(inflow, (int, float)) and inflow > 0:
17761888
lifestyle["annual_income"] = round(float(inflow), 2)
17771889
if isinstance(outflow, (int, float)) and outflow > 0:
17781890
lifestyle["monthly_expenses"] = round(float(outflow) / 12.0, 2)
1779-
if isinstance(inflow, (int, float)) and inflow > 0 and isinstance(outflow, (int, float)):
1780-
sp = max(0.0, (float(inflow) - float(outflow))) / float(inflow) * 100.0
1891+
if isinstance(bank_inflow, (int, float)) and bank_inflow > 0 and isinstance(bank_outflow, (int, float)):
1892+
sp = max(0.0, (float(bank_inflow) - float(bank_outflow))) / float(bank_inflow) * 100.0
1893+
sp = max(0.0, min(100.0, sp))
17811894
lifestyle["savings_percent"] = round(sp, 2)
17821895
except Exception:
17831896
pass
@@ -1923,6 +2036,10 @@ def upload_document():
19232036
bank_data["provenance"] = summaries["provenance"]
19242037
# Do not include raw transactions in PDF; attach summary only
19252038
extracted_data[f"{name} {idx+1}"] = bank_data
2039+
try:
2040+
_persist_metrics_for_doc(doc_id, bank_data)
2041+
except Exception as e:
2042+
print(f"Persist metrics (bank) failed: {e}")
19262043
else:
19272044
other_data = func(text)
19282045
# Merge DB-backed summaries if present (useful for CAS/Portfolio PDFs)
@@ -1954,6 +2071,10 @@ def upload_document():
19542071
print(f"Error updating CAS metadata: {e}")
19552072

19562073
extracted_data[f"{name} {idx+1}"] = other_data
2074+
try:
2075+
_persist_metrics_for_doc(doc_id, other_data)
2076+
except Exception as e:
2077+
print(f"Persist metrics ({doc_type}) failed: {e}")
19572078
else:
19582079
# Generic fallback: still return deterministic DB-backed summaries even if doc type is unknown
19592080
generic = {}
@@ -2272,10 +2393,46 @@ def _assemble_financial_inputs(q: dict, doc_insights=None) -> dict:
22722393
outflow = bank.get("total_outflows")
22732394
if isinstance(inflow, (int, float)) and inflow > 0 and isinstance(outflow, (int, float)):
22742395
sp = max(0.0, (inflow - outflow)) / inflow * 100.0
2396+
sp = max(0.0, min(100.0, sp))
22752397
payload["savings"]["savingsPercent"] = round(sp, 2)
22762398
except Exception:
22772399
pass
22782400

2401+
# Merge insurance covers from document insights when questionnaire values are absent
2402+
try:
2403+
ins = di.get("insurance") or {}
2404+
sum_val = ins.get("sum_assured_or_insured")
2405+
ins_type = str(ins.get("insurance_type") or "").lower()
2406+
if isinstance(sum_val, (int, float)) and sum_val > 0:
2407+
if not payload["insurance"].get("lifeCover") and ("life" in ins_type or "term" in ins_type or "ulip" in ins_type):
2408+
payload["insurance"]["lifeCover"] = float(sum_val)
2409+
if not payload["insurance"].get("healthCover") and ("health" in ins_type or "mediclaim" in ins_type):
2410+
payload["insurance"]["healthCover"] = float(sum_val)
2411+
# Unknown type: default to life cover if both missing
2412+
if not payload["insurance"].get("lifeCover") and not payload["insurance"].get("healthCover"):
2413+
payload["insurance"]["lifeCover"] = float(sum_val)
2414+
except Exception:
2415+
pass
2416+
2417+
# Merge portfolio allocation from document insights (CAS) when missing
2418+
try:
2419+
port = di.get("portfolio") or {}
2420+
alloc = payload.get("investments", {}).get("allocation") or {}
2421+
def _set_if_missing(key, src_key):
2422+
if alloc.get(key) in (None, "", 0):
2423+
v = port.get(src_key)
2424+
if isinstance(v, (int, float)) and v >= 0:
2425+
alloc[key] = float(v)
2426+
_set_if_missing("equity", "equity")
2427+
_set_if_missing("debt", "debt")
2428+
_set_if_missing("gold", "gold")
2429+
_set_if_missing("realEstate", "realEstate")
2430+
_set_if_missing("insuranceLinked", "insuranceLinked")
2431+
_set_if_missing("cash", "cash")
2432+
payload["investments"]["allocation"] = alloc
2433+
except Exception:
2434+
pass
2435+
22792436
return payload
22802437

22812438
def _build_client_facts(q: dict, analysis: dict, doc_insights=None) -> dict:
@@ -2321,8 +2478,11 @@ def _build_client_facts(q: dict, analysis: dict, doc_insights=None) -> dict:
23212478
"total_inflows": bank.get("total_inflows"),
23222479
"total_outflows": bank.get("total_outflows"),
23232480
"net_cashflow": bank.get("net_cashflow"),
2481+
"opening_balance": bank.get("opening_balance"),
2482+
"closing_balance": bank.get("closing_balance"),
23242483
},
23252484
"portfolio": portfolio,
2485+
"extracts": di.get("raw_extracts"),
23262486
"analysis": analysis,
23272487
}
23282488
return facts

output/index.db

0 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)