Wat-Street · KenGao0216 · Jun 23, 2026
diff --git a/backend/app/ingestion/transcripts.py b/backend/app/ingestion/transcripts.py
@@ -5,6 +5,7 @@
 import re
 from typing import Any
 import pandas as pd
+from sqlalchemy.orm import Session
 
 from .ticker_config import load_tickers
 SOURCE = "defeatbeta"
@@ -44,6 +45,9 @@ def _classify_speaker_type(speaker: str, role: str, mgmt_speakers: set[str]) ->
 # transform
 def split_segments(df: pd.DataFrame) -> list[dict[str, Any]]:
     """Tag each transcript paragraph with its section, speaker identity, and Q&A exchange.
+
+    Segments are model-agnostic: one row per paragraph with raw text. chunking
+    is applied later in text_tokenizer
     """
     missing = [c for c in _REQUIRED_COLUMNS if c not in df.columns]
     if missing:
@@ -95,21 +99,16 @@ def build_record(
     ticker: str,
     fiscal_year: int,
     fiscal_quarter: int,
-    report_date: Any,
     transcript_df: pd.DataFrame,
-    source_transcript_id: Any = None,
 ) -> dict[str, Any]:
     if not 1 <= int(fiscal_quarter) <= 4:
         raise ValueError(f"fiscal_quarter must be 1-4, got {fiscal_quarter}")
 
     segments = split_segments(transcript_df)
-    report = pd.to_datetime(report_date, errors="coerce")
     return {
         "ticker": ticker,
         "quarter": f"{int(fiscal_year)}Q{int(fiscal_quarter)}",
         "source": SOURCE,
-        "source_transcript_id": None if source_transcript_id is None else str(source_transcript_id),
-        "report_date": None if pd.isna(report) else report.date().isoformat(),
         "segments": segments,
     }
 
@@ -142,14 +141,35 @@ def ingest(ticker: str) -> list[dict[str, Any]]:
         if year < MIN_FISCAL_YEAR:
             continue
         quarter = int(row["fiscal_quarter"])
-        report_date = row.get("report_date")
-        transcript_id = row.get("transcripts_id")
-        if transcript_id is not None and pd.isna(transcript_id):
-            transcript_id = None
         df = fetch_transcript(ticker, year, quarter)
-        records.append(build_record(ticker, year, quarter, report_date, df, transcript_id))
+        records.append(build_record(ticker, year, quarter, df))
     return records
 
 
 def ingest_all() -> dict[str, list[dict[str, Any]]]:
     return {ticker: ingest(ticker) for ticker in load_tickers()}
+
+
+def upsert_transcripts(session: Session, records: list[dict[str, Any]]) -> int:
+    """Upsert transcript records into the transcripts table keyed on (ticker, quarter)
+
+    Commits the session.  Returns the number of rows upserted.
+    """
+    if not records:
+        return 0
+
+    from sqlalchemy.dialects.postgresql import insert as pg_insert
+    from app.models.db import Transcript
+
+    stmt = pg_insert(Transcript).values(records)
+    stmt = stmt.on_conflict_do_update(
+        index_elements=["ticker", "quarter"],
+        set_={
+            "source": stmt.excluded.source,
+            "segments": stmt.excluded.segments,
+            "ingested_at": stmt.excluded.ingested_at,
+        },
+    )
+    session.execute(stmt)
+    session.commit()
+    return len(records)
diff --git a/backend/app/ml/features.py b/backend/app/ml/features.py
@@ -2,40 +2,70 @@
 # Do not reorder FEATURE_ORDER without updating MODALITY_GROUPS indices.
 
 FEATURE_ORDER = [
-    # fundamentals (indices 0–6)
+    # fundamentals (valuation)
     "fcf_yield", "trailing_pe", "forward_pe", "ev_ebitda",
-    "revenue_cagr_3y", "gross_margin", "net_debt",
-    # technical (indices 7–12)
+
+    #  fundamentals (quality)
+    "revenue_cagr_3y", "gross_margin", "net_debt_to_ebita", # to do: add to fundamentals.py
+
+    # technical
     "rsi_signal", "macd_signal", "breakout_score", "volume_surge",
-    "close", "volume",
-    # macro (indices 13–17)
-    "fed_funds_rate", "cpi_yoy", "unemployment", "treasury_10y", "yield_spread",
-    # options (indices 18–20)
+
+    # momentum
+    "return_21d", "return_63d", "return_126d", "market_relative_strength_63d",
+
+    #market risk / liquidity
+    "realized_volatility_21d", "beta_126d", "max_drawdown_63d", "dollar_volume_20d_log",
+
+    # macro - pending the FRED ingestion (macro.py still empty)
+    "fed_funds_rate", "fed_funds_change_3m", "cpi_yoy", "cpi_trend_3m", "unemployment", "unemployment_change_3m", 
+    "treasury_10y", "real_treasury_10y", "yield_spread",
+
+    # options 
     "put_call_ratio", "unusual_options_score", "gamma_exposure",
-    # sentiment (indices 21–25)
-    "mgmt_sentiment_score", "qa_sentiment_score", "news_sentiment",
+
+    # insider 
     "insider_buy_score", "net_insider_delta",
+
+    # sentiment 
+    "mgmt_sentiment_score", "qa_sentiment_score", "mgmt_qa_sentiment_gap", "transcript_sentiment_change_qoq",
+
+    #analyst / sells-side
+    "price_target_upside", "price_target_dispersion", "recommendation_score", "recent_rating_delta", "coverage_volume",
 ]
 
+MODALITY_FEATURES: dict[str, list[int]] = {
+    "valuation": ["fcf_yield", "trailing_pe", "forward_pe", "ev_ebita"], 
+    "quality": ["revenue_cagr_3y", "gross_margin", "net_debt_to_ebita"],
+    "technical": ["rsi_signal", "macd_signal", "breakout_score", "volume_surge"],
+    "momentum": [ "return_21d", "return_63d", "return_126d", "market_relative_strength_63d"],
+    "market_risk": ["realized_volatility_21d", "beta_126d", "max_drawdown_63d", "dollar_volume_20d_log"],
+    "macro": ["fed_funds_rate", "fed_funds_change_3m", "cpi_yoy", "cpi_trend_3m", "unemployment", "unemployment_change_3m", 
+    "treasury_10y", "real_treasury_10y", "yield_spread"],
+    "options": [ "put_call_ratio", "unusual_options_score", "gamma_exposure",],
+    "insider": ["insider_buy_score", "net_insider_delta"],
+    "transcript": ["mgmt_sentiment_score", "qa_sentiment_score", "mgmt_qa_sentiment_gap", "transcript_sentiment_change_qoq"],
+    "analyst": ["price_target_upside", "price_target_dispersion", "recommendation_score", "recent_rating_delta", "coverage_volume"]
+
+}
+_FEATURE_INDEX: dict[str, int] = {name: i for i, name in enumerate(FEATURE_ORDER)}
 MODALITY_GROUPS: dict[str, list[int]] = {
-    "fundamentals": [0, 1, 2, 3, 4, 5, 6],
-    "technical":    [7, 8, 9, 10, 11, 12],
-    "macro":        [13, 14, 15, 16, 17],
-    "options":      [18, 19, 20],
-    "sentiment":    [21, 22, 23, 24, 25],
+    modality: [_FEATURE_INDEX[name] for name in names]
+    for modality, names in MODALITY_FEATURES.items()
 }
-
 PERSONA_MODALITIES: dict[str, list[str]] = {
-    "value_fundamentalist":  ["fundamentals"],
-    "growth_visionary":      ["fundamentals"],
-    "quant_momentum":        ["technical"],
-    "technical_analyst":     ["technical"],
-    "macro_topdown":         ["macro", "fundamentals"],
-    "options_flow_trader":   ["options", "technical"],
-    "sentiment_trader":      ["sentiment"],
-    "insider_institutional": ["sentiment", "fundamentals"],
-    "dividend_income":       ["fundamentals"],
+    "value_fundamentalist":  ["valuation", "quality", "insider", "transcript"],
+    "growth_visionary":      ["valuation", "quality", "analyst", "transcript"],
+    "quant_momentum":        ["momemntum", "market_risk", "options"],
+    "technical_analyst":     ["technical", "momentum", "market_risk"],
+    "macro_topdown":         ["macro", "valuation", "momentum"],
+    "options_flow_trader":   ["options", "technical", "momentum"],
+    "sentiment_trader":      ["transcript", "analyst", "options", "momentum"],
+    "insider_institutional": ["insider", "valuation", "quality", "transcript"],
+    "dividend_income":       ["valuation", "quality", "macro", "transcript"],
 }
 
 N_FEATURES = len(FEATURE_ORDER)
-HORIZONS = [21, 63, 126, 252]  # trading days: 1M, 3M, 6M, 12M
+HORIZONS = [10, 21, 63, 126, 252]  # trading days: 2W, 1M, 3M, 6M, 12M
+
+#TO DO - update personal yamls
diff --git a/backend/app/ml/text_tokenizer.py b/backend/app/ml/text_tokenizer.py
@@ -0,0 +1,105 @@
+# FinBERT text tokenization for sentiment scoring
+# NOTE: This module tokenizes transcript *text*: it chunks paragraphs 
+# to fit FinBERT's 512-token window. 
+
+#TO-DO: still need to run FinBERT on chunks to get scores for dataset - Kiana
+
+from __future__ import annotations
+import re
+from typing import Any
+
+FINBERT_MODEL = "ProsusAI/finbert"
+MAX_TOKENS = 450
+
+# Prefer to break a chunk on a sentence boundary so a split never lands mid-clause when avoidable.
+_SENTENCE_RE = re.compile(r"(?<=[.!?])\s+")
+
+_tokenizer = None
+
+
+def get_tokenizer():
+    """Lazily load and cache the FinBERT WordPiece tokenizer.
+    """
+    global _tokenizer
+    if _tokenizer is None:
+        from transformers import AutoTokenizer
+
+        _tokenizer = AutoTokenizer.from_pretrained(FINBERT_MODEL)
+    return _tokenizer
+
+
+def _count_tokens(text: str, tokenizer) -> int:
+    return len(tokenizer.encode(text, add_special_tokens=False))
+
+
+def _split_long_sentence(sentence: str, tokenizer, max_tokens: int) -> list[tuple[str, int]]:
+    """Hard-split a single sentence that on its own exceeds max_tokens, on token windows."""
+    ids = tokenizer.encode(sentence, add_special_tokens=False)
+    chunks: list[tuple[str, int]] = []
+    for start in range(0, len(ids), max_tokens):
+        window = ids[start:start + max_tokens]
+        chunks.append((tokenizer.decode(window).strip(), len(window)))
+    return chunks
+
+
+def chunk_text(text: str, tokenizer=None, max_tokens: int = MAX_TOKENS) -> list[tuple[str, int]]:
+    """Split text into chunks of at most max_tokens FinBERT tokens, preferring sentence breaks.
+
+    Returns (chunk_text, token_count) pairs.
+    """
+    if tokenizer is None:
+        tokenizer = get_tokenizer()
+
+    total = _count_tokens(text, tokenizer)
+    if total <= max_tokens:
+        return [(text, total)]
+
+    chunks: list[tuple[str, int]] = []
+    current: list[str] = []
+    current_tokens = 0
+    for sentence in _SENTENCE_RE.split(text):
+        sentence = sentence.strip()
+        if not sentence:
+            continue
+        n = _count_tokens(sentence, tokenizer)
+        if n > max_tokens:
+            # A single sentence larger than the window: flush what we have, then window-split it.
+            if current:
+                chunks.append((" ".join(current), current_tokens))
+                current, current_tokens = [], 0
+            chunks.extend(_split_long_sentence(sentence, tokenizer, max_tokens))
+            continue
+        if current_tokens + n > max_tokens:
+            chunks.append((" ".join(current), current_tokens))
+            current, current_tokens = [sentence], n
+        else:
+            current.append(sentence)
+            current_tokens += n
+    if current:
+        chunks.append((" ".join(current), current_tokens))
+    return chunks
+
+
+def chunk_segments(segments: list[dict[str, Any]], tokenizer=None) -> list[dict[str, Any]]:
+    """Expand transcript segments into FinBERT-sized scoring chunks.
+
+    Each input segment (one paragraph) yields one or more chunk dicts that carry the paragraph's identity and tags (paragraph, role, speaker,
+    speaker_type, qa_exchange) plus the chunk's `chunk` index within the paragraph, its `token_count`, and `text`.
+    """
+    if tokenizer is None:
+        tokenizer = get_tokenizer()
+
+    chunks: list[dict[str, Any]] = []
+    for seg in segments:
+        for chunk_index, (chunk, token_count) in enumerate(chunk_text(seg["text"], tokenizer)):
+            chunks.append({
+                "paragraph": seg["paragraph"],
+                "chunk": chunk_index,
+                "role": seg["role"],
+                "speaker": seg["speaker"],
+                "speaker_type": seg["speaker_type"],
+                "qa_exchange": seg["qa_exchange"],
+                "token_count": token_count,
+                "text": chunk,
+            })
+    return chunks
diff --git a/backend/tests/test_text_tokenizer.py b/backend/tests/test_text_tokenizer.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+import pytest
+
+from app.ml import text_tokenizer
+from app.ml.text_tokenizer import MAX_TOKENS, chunk_segments, chunk_text
+
+
+class _FakeTokenizer:
+    """Whitespace stand-in for the FinBERT WordPiece tokenizer.
+    """
+
+    def encode(self, text: str, add_special_tokens: bool = False) -> list[str]:
+        return text.split()
+
+    def decode(self, ids: list[str]) -> str:
+        return " ".join(ids)
+
+
+@pytest.fixture(autouse=True)
+def _fake_tokenizer(monkeypatch):
+    monkeypatch.setattr(text_tokenizer, "get_tokenizer", lambda: _FakeTokenizer())
+
+
+# chunk_text
+def test_chunk_text_short_text_is_single_chunk():
+    chunks = chunk_text("We had a strong quarter with record revenue.")
+    assert len(chunks) == 1
+    text, token_count = chunks[0]
+    assert text == "We had a strong quarter with record revenue."
+    assert token_count == 8
+
+
+def test_chunk_text_window_splits_long_unbroken_text():
+    # No sentence breaks, longer than the window -> window-split into ordered chunks, nothing lost.
+    long_text = " ".join(["word"] * (MAX_TOKENS * 2 + 30))
+    chunks = chunk_text(long_text)
+    assert len(chunks) == 3
+    assert all(token_count <= MAX_TOKENS for _, token_count in chunks)
+    assert " ".join(text for text, _ in chunks) == long_text
+
+
+def test_chunk_text_packs_sentences_under_the_limit():
+    sentence = " ".join(["alpha"] * 90) + "."
+    paragraph = " ".join([sentence] * 8)  # 8 * ~91 tokens ~ 728 > MAX_TOKENS
+    chunks = chunk_text(paragraph)
+    assert len(chunks) >= 2
+    assert all(token_count <= MAX_TOKENS for _, token_count in chunks)
+
+
+def test_chunk_text_preserves_raw_text_verbatim():
+    text = "Margins?  We DON'T disclose that... yet."
+    chunks = chunk_text(text)
+    assert chunks == [(text, len(text.split()))]
+
+
+def test_chunk_text_accepts_injected_tokenizer():
+    chunks = chunk_text("one two three", tokenizer=_FakeTokenizer())
+    assert chunks == [("one two three", 3)]
+
+
+# chunk_segments
+def _segment(paragraph: int, text: str, **overrides) -> dict:
+    seg = {
+        "paragraph": paragraph,
+        "role": "qa",
+        "speaker": "Jane CEO",
+        "speaker_type": "executive",
+        "qa_exchange": 1,
+        "text": text,
+    }
+    seg.update(overrides)
+    return seg
+
+
+def test_chunk_segments_carries_tags_and_indexes_chunks():
+    long_text = " ".join(["word"] * (MAX_TOKENS + 5))
+    segments = [
+        _segment(1, "Short answer.", qa_exchange=1),
+        _segment(2, long_text, qa_exchange=2),
+    ]
+    chunks = chunk_segments(segments)
+
+    # Paragraph 1 stays one chunk; paragraph 2 splits into two.
+    by_para = {}
+    for c in chunks:
+        by_para.setdefault(c["paragraph"], []).append(c)
+    assert [c["chunk"] for c in by_para[1]] == [0]
+    assert [c["chunk"] for c in by_para[2]] == [0, 1]
+
+    # Tags from the source segment ride along onto every chunk.
+    assert by_para[2][0]["qa_exchange"] == 2
+    assert by_para[2][0]["speaker_type"] == "executive"
+    assert all(c["token_count"] <= MAX_TOKENS for c in chunks)
+    assert set(chunks[0].keys()) == {
+        "paragraph", "chunk", "role", "speaker", "speaker_type", "qa_exchange", "token_count", "text",
+    }