diff --git a/backend/app/ingestion/transcripts.py b/backend/app/ingestion/transcripts.py index 761e790..80045d3 100644 --- a/backend/app/ingestion/transcripts.py +++ b/backend/app/ingestion/transcripts.py @@ -5,6 +5,7 @@ import re from typing import Any import pandas as pd +from sqlalchemy.orm import Session from .ticker_config import load_tickers SOURCE = "defeatbeta" @@ -44,6 +45,9 @@ def _classify_speaker_type(speaker: str, role: str, mgmt_speakers: set[str]) -> # transform def split_segments(df: pd.DataFrame) -> list[dict[str, Any]]: """Tag each transcript paragraph with its section, speaker identity, and Q&A exchange. + + Segments are model-agnostic: one row per paragraph with raw text. chunking + is applied later in text_tokenizer """ missing = [c for c in _REQUIRED_COLUMNS if c not in df.columns] if missing: @@ -95,21 +99,16 @@ def build_record( ticker: str, fiscal_year: int, fiscal_quarter: int, - report_date: Any, transcript_df: pd.DataFrame, - source_transcript_id: Any = None, ) -> dict[str, Any]: if not 1 <= int(fiscal_quarter) <= 4: raise ValueError(f"fiscal_quarter must be 1-4, got {fiscal_quarter}") segments = split_segments(transcript_df) - report = pd.to_datetime(report_date, errors="coerce") return { "ticker": ticker, "quarter": f"{int(fiscal_year)}Q{int(fiscal_quarter)}", "source": SOURCE, - "source_transcript_id": None if source_transcript_id is None else str(source_transcript_id), - "report_date": None if pd.isna(report) else report.date().isoformat(), "segments": segments, } @@ -142,14 +141,35 @@ def ingest(ticker: str) -> list[dict[str, Any]]: if year < MIN_FISCAL_YEAR: continue quarter = int(row["fiscal_quarter"]) - report_date = row.get("report_date") - transcript_id = row.get("transcripts_id") - if transcript_id is not None and pd.isna(transcript_id): - transcript_id = None df = fetch_transcript(ticker, year, quarter) - records.append(build_record(ticker, year, quarter, report_date, df, transcript_id)) + records.append(build_record(ticker, year, quarter, df)) return records def ingest_all() -> dict[str, list[dict[str, Any]]]: return {ticker: ingest(ticker) for ticker in load_tickers()} + + +def upsert_transcripts(session: Session, records: list[dict[str, Any]]) -> int: + """Upsert transcript records into the transcripts table keyed on (ticker, quarter) + + Commits the session. Returns the number of rows upserted. + """ + if not records: + return 0 + + from sqlalchemy.dialects.postgresql import insert as pg_insert + from app.models.db import Transcript + + stmt = pg_insert(Transcript).values(records) + stmt = stmt.on_conflict_do_update( + index_elements=["ticker", "quarter"], + set_={ + "source": stmt.excluded.source, + "segments": stmt.excluded.segments, + "ingested_at": stmt.excluded.ingested_at, + }, + ) + session.execute(stmt) + session.commit() + return len(records) diff --git a/backend/app/ml/features.py b/backend/app/ml/features.py index 62fc75c..502e4ae 100644 --- a/backend/app/ml/features.py +++ b/backend/app/ml/features.py @@ -2,40 +2,70 @@ # Do not reorder FEATURE_ORDER without updating MODALITY_GROUPS indices. FEATURE_ORDER = [ - # fundamentals (indices 0–6) + # fundamentals (valuation) "fcf_yield", "trailing_pe", "forward_pe", "ev_ebitda", - "revenue_cagr_3y", "gross_margin", "net_debt", - # technical (indices 7–12) + + # fundamentals (quality) + "revenue_cagr_3y", "gross_margin", "net_debt_to_ebita", # to do: add to fundamentals.py + + # technical "rsi_signal", "macd_signal", "breakout_score", "volume_surge", - "close", "volume", - # macro (indices 13–17) - "fed_funds_rate", "cpi_yoy", "unemployment", "treasury_10y", "yield_spread", - # options (indices 18–20) + + # momentum + "return_21d", "return_63d", "return_126d", "market_relative_strength_63d", + + #market risk / liquidity + "realized_volatility_21d", "beta_126d", "max_drawdown_63d", "dollar_volume_20d_log", + + # macro - pending the FRED ingestion (macro.py still empty) + "fed_funds_rate", "fed_funds_change_3m", "cpi_yoy", "cpi_trend_3m", "unemployment", "unemployment_change_3m", + "treasury_10y", "real_treasury_10y", "yield_spread", + + # options "put_call_ratio", "unusual_options_score", "gamma_exposure", - # sentiment (indices 21–25) - "mgmt_sentiment_score", "qa_sentiment_score", "news_sentiment", + + # insider "insider_buy_score", "net_insider_delta", + + # sentiment + "mgmt_sentiment_score", "qa_sentiment_score", "mgmt_qa_sentiment_gap", "transcript_sentiment_change_qoq", + + #analyst / sells-side + "price_target_upside", "price_target_dispersion", "recommendation_score", "recent_rating_delta", "coverage_volume", ] +MODALITY_FEATURES: dict[str, list[int]] = { + "valuation": ["fcf_yield", "trailing_pe", "forward_pe", "ev_ebita"], + "quality": ["revenue_cagr_3y", "gross_margin", "net_debt_to_ebita"], + "technical": ["rsi_signal", "macd_signal", "breakout_score", "volume_surge"], + "momentum": [ "return_21d", "return_63d", "return_126d", "market_relative_strength_63d"], + "market_risk": ["realized_volatility_21d", "beta_126d", "max_drawdown_63d", "dollar_volume_20d_log"], + "macro": ["fed_funds_rate", "fed_funds_change_3m", "cpi_yoy", "cpi_trend_3m", "unemployment", "unemployment_change_3m", + "treasury_10y", "real_treasury_10y", "yield_spread"], + "options": [ "put_call_ratio", "unusual_options_score", "gamma_exposure",], + "insider": ["insider_buy_score", "net_insider_delta"], + "transcript": ["mgmt_sentiment_score", "qa_sentiment_score", "mgmt_qa_sentiment_gap", "transcript_sentiment_change_qoq"], + "analyst": ["price_target_upside", "price_target_dispersion", "recommendation_score", "recent_rating_delta", "coverage_volume"] + +} +_FEATURE_INDEX: dict[str, int] = {name: i for i, name in enumerate(FEATURE_ORDER)} MODALITY_GROUPS: dict[str, list[int]] = { - "fundamentals": [0, 1, 2, 3, 4, 5, 6], - "technical": [7, 8, 9, 10, 11, 12], - "macro": [13, 14, 15, 16, 17], - "options": [18, 19, 20], - "sentiment": [21, 22, 23, 24, 25], + modality: [_FEATURE_INDEX[name] for name in names] + for modality, names in MODALITY_FEATURES.items() } - PERSONA_MODALITIES: dict[str, list[str]] = { - "value_fundamentalist": ["fundamentals"], - "growth_visionary": ["fundamentals"], - "quant_momentum": ["technical"], - "technical_analyst": ["technical"], - "macro_topdown": ["macro", "fundamentals"], - "options_flow_trader": ["options", "technical"], - "sentiment_trader": ["sentiment"], - "insider_institutional": ["sentiment", "fundamentals"], - "dividend_income": ["fundamentals"], + "value_fundamentalist": ["valuation", "quality", "insider", "transcript"], + "growth_visionary": ["valuation", "quality", "analyst", "transcript"], + "quant_momentum": ["momemntum", "market_risk", "options"], + "technical_analyst": ["technical", "momentum", "market_risk"], + "macro_topdown": ["macro", "valuation", "momentum"], + "options_flow_trader": ["options", "technical", "momentum"], + "sentiment_trader": ["transcript", "analyst", "options", "momentum"], + "insider_institutional": ["insider", "valuation", "quality", "transcript"], + "dividend_income": ["valuation", "quality", "macro", "transcript"], } N_FEATURES = len(FEATURE_ORDER) -HORIZONS = [21, 63, 126, 252] # trading days: 1M, 3M, 6M, 12M +HORIZONS = [10, 21, 63, 126, 252] # trading days: 2W, 1M, 3M, 6M, 12M + +#TO DO - update personal yamls \ No newline at end of file diff --git a/backend/app/ml/text_tokenizer.py b/backend/app/ml/text_tokenizer.py new file mode 100644 index 0000000..1abb6a7 --- /dev/null +++ b/backend/app/ml/text_tokenizer.py @@ -0,0 +1,105 @@ +# FinBERT text tokenization for sentiment scoring +# NOTE: This module tokenizes transcript *text*: it chunks paragraphs +# to fit FinBERT's 512-token window. + +#TO-DO: still need to run FinBERT on chunks to get scores for dataset - Kiana + +from __future__ import annotations +import re +from typing import Any + +FINBERT_MODEL = "ProsusAI/finbert" +MAX_TOKENS = 450 + +# Prefer to break a chunk on a sentence boundary so a split never lands mid-clause when avoidable. +_SENTENCE_RE = re.compile(r"(?<=[.!?])\s+") + +_tokenizer = None + + +def get_tokenizer(): + """Lazily load and cache the FinBERT WordPiece tokenizer. + """ + global _tokenizer + if _tokenizer is None: + from transformers import AutoTokenizer + + _tokenizer = AutoTokenizer.from_pretrained(FINBERT_MODEL) + return _tokenizer + + +def _count_tokens(text: str, tokenizer) -> int: + return len(tokenizer.encode(text, add_special_tokens=False)) + + +def _split_long_sentence(sentence: str, tokenizer, max_tokens: int) -> list[tuple[str, int]]: + """Hard-split a single sentence that on its own exceeds max_tokens, on token windows.""" + ids = tokenizer.encode(sentence, add_special_tokens=False) + chunks: list[tuple[str, int]] = [] + for start in range(0, len(ids), max_tokens): + window = ids[start:start + max_tokens] + chunks.append((tokenizer.decode(window).strip(), len(window))) + return chunks + + +def chunk_text(text: str, tokenizer=None, max_tokens: int = MAX_TOKENS) -> list[tuple[str, int]]: + """Split text into chunks of at most max_tokens FinBERT tokens, preferring sentence breaks. + + Returns (chunk_text, token_count) pairs. + """ + if tokenizer is None: + tokenizer = get_tokenizer() + + total = _count_tokens(text, tokenizer) + if total <= max_tokens: + return [(text, total)] + + chunks: list[tuple[str, int]] = [] + current: list[str] = [] + current_tokens = 0 + for sentence in _SENTENCE_RE.split(text): + sentence = sentence.strip() + if not sentence: + continue + n = _count_tokens(sentence, tokenizer) + if n > max_tokens: + # A single sentence larger than the window: flush what we have, then window-split it. + if current: + chunks.append((" ".join(current), current_tokens)) + current, current_tokens = [], 0 + chunks.extend(_split_long_sentence(sentence, tokenizer, max_tokens)) + continue + if current_tokens + n > max_tokens: + chunks.append((" ".join(current), current_tokens)) + current, current_tokens = [sentence], n + else: + current.append(sentence) + current_tokens += n + if current: + chunks.append((" ".join(current), current_tokens)) + return chunks + + +def chunk_segments(segments: list[dict[str, Any]], tokenizer=None) -> list[dict[str, Any]]: + """Expand transcript segments into FinBERT-sized scoring chunks. + + Each input segment (one paragraph) yields one or more chunk dicts that carry the paragraph's identity and tags (paragraph, role, speaker, + speaker_type, qa_exchange) plus the chunk's `chunk` index within the paragraph, its `token_count`, and `text`. + """ + if tokenizer is None: + tokenizer = get_tokenizer() + + chunks: list[dict[str, Any]] = [] + for seg in segments: + for chunk_index, (chunk, token_count) in enumerate(chunk_text(seg["text"], tokenizer)): + chunks.append({ + "paragraph": seg["paragraph"], + "chunk": chunk_index, + "role": seg["role"], + "speaker": seg["speaker"], + "speaker_type": seg["speaker_type"], + "qa_exchange": seg["qa_exchange"], + "token_count": token_count, + "text": chunk, + }) + return chunks diff --git a/backend/tests/test_text_tokenizer.py b/backend/tests/test_text_tokenizer.py new file mode 100644 index 0000000..c3f1c29 --- /dev/null +++ b/backend/tests/test_text_tokenizer.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +import pytest + +from app.ml import text_tokenizer +from app.ml.text_tokenizer import MAX_TOKENS, chunk_segments, chunk_text + + +class _FakeTokenizer: + """Whitespace stand-in for the FinBERT WordPiece tokenizer. + """ + + def encode(self, text: str, add_special_tokens: bool = False) -> list[str]: + return text.split() + + def decode(self, ids: list[str]) -> str: + return " ".join(ids) + + +@pytest.fixture(autouse=True) +def _fake_tokenizer(monkeypatch): + monkeypatch.setattr(text_tokenizer, "get_tokenizer", lambda: _FakeTokenizer()) + + +# chunk_text +def test_chunk_text_short_text_is_single_chunk(): + chunks = chunk_text("We had a strong quarter with record revenue.") + assert len(chunks) == 1 + text, token_count = chunks[0] + assert text == "We had a strong quarter with record revenue." + assert token_count == 8 + + +def test_chunk_text_window_splits_long_unbroken_text(): + # No sentence breaks, longer than the window -> window-split into ordered chunks, nothing lost. + long_text = " ".join(["word"] * (MAX_TOKENS * 2 + 30)) + chunks = chunk_text(long_text) + assert len(chunks) == 3 + assert all(token_count <= MAX_TOKENS for _, token_count in chunks) + assert " ".join(text for text, _ in chunks) == long_text + + +def test_chunk_text_packs_sentences_under_the_limit(): + sentence = " ".join(["alpha"] * 90) + "." + paragraph = " ".join([sentence] * 8) # 8 * ~91 tokens ~ 728 > MAX_TOKENS + chunks = chunk_text(paragraph) + assert len(chunks) >= 2 + assert all(token_count <= MAX_TOKENS for _, token_count in chunks) + + +def test_chunk_text_preserves_raw_text_verbatim(): + text = "Margins? We DON'T disclose that... yet." + chunks = chunk_text(text) + assert chunks == [(text, len(text.split()))] + + +def test_chunk_text_accepts_injected_tokenizer(): + chunks = chunk_text("one two three", tokenizer=_FakeTokenizer()) + assert chunks == [("one two three", 3)] + + +# chunk_segments +def _segment(paragraph: int, text: str, **overrides) -> dict: + seg = { + "paragraph": paragraph, + "role": "qa", + "speaker": "Jane CEO", + "speaker_type": "executive", + "qa_exchange": 1, + "text": text, + } + seg.update(overrides) + return seg + + +def test_chunk_segments_carries_tags_and_indexes_chunks(): + long_text = " ".join(["word"] * (MAX_TOKENS + 5)) + segments = [ + _segment(1, "Short answer.", qa_exchange=1), + _segment(2, long_text, qa_exchange=2), + ] + chunks = chunk_segments(segments) + + # Paragraph 1 stays one chunk; paragraph 2 splits into two. + by_para = {} + for c in chunks: + by_para.setdefault(c["paragraph"], []).append(c) + assert [c["chunk"] for c in by_para[1]] == [0] + assert [c["chunk"] for c in by_para[2]] == [0, 1] + + # Tags from the source segment ride along onto every chunk. + assert by_para[2][0]["qa_exchange"] == 2 + assert by_para[2][0]["speaker_type"] == "executive" + assert all(c["token_count"] <= MAX_TOKENS for c in chunks) + assert set(chunks[0].keys()) == { + "paragraph", "chunk", "role", "speaker", "speaker_type", "qa_exchange", "token_count", "text", + } diff --git a/backend/tests/test_transcripts.py b/backend/tests/test_transcripts.py index 942fa89..ff75e51 100644 --- a/backend/tests/test_transcripts.py +++ b/backend/tests/test_transcripts.py @@ -130,32 +130,18 @@ def test_split_segments_pairs_multiple_qa_exchanges(): # build_record def test_build_record_contract(): - rec = build_record("AAPL", 2024, 1, "2024-02-01", _transcript_df(_CALL), "tid-123") - assert set(rec.keys()) == { - "ticker", "quarter", "source", "source_transcript_id", "report_date", "segments", - } + rec = build_record("AAPL", 2024, 1, _transcript_df(_CALL)) + assert set(rec.keys()) == {"ticker", "quarter", "source", "segments"} assert rec["ticker"] == "AAPL" assert rec["quarter"] == "2024Q1" assert rec["source"] == "defeatbeta" - assert rec["source_transcript_id"] == "tid-123" - assert rec["report_date"] == "2024-02-01" assert len(rec["segments"]) == len(_CALL) assert {s["role"] for s in rec["segments"]} <= {"management", "qa"} -def test_build_record_coerces_transcript_id_to_str(): - rec = build_record("AAPL", 2024, 1, "2024-02-01", _transcript_df(_CALL), 4567) - assert rec["source_transcript_id"] == "4567" - - -def test_build_record_allows_missing_transcript_id(): - rec = build_record("AAPL", 2024, 1, "2024-02-01", _transcript_df(_CALL)) - assert rec["source_transcript_id"] is None - - def test_build_record_rejects_bad_quarter(): with pytest.raises(ValueError, match="fiscal_quarter"): - build_record("AAPL", 2024, 5, "2024-02-01", _transcript_df(_CALL)) + build_record("AAPL", 2024, 5, _transcript_df(_CALL)) #ingest @@ -164,8 +150,6 @@ def test_ingest_filters_by_min_year_and_builds_records(monkeypatch): "symbol": ["AAPL", "AAPL", "AAPL"], "fiscal_year": [MIN_FISCAL_YEAR - 1, MIN_FISCAL_YEAR, MIN_FISCAL_YEAR + 1], "fiscal_quarter": [4, 1, 2], - "report_date": ["2014-01-27", "2015-01-27", "2016-04-26"], - "transcripts_id": ["t-old", "t-2015q1", "t-2016q2"], }) monkeypatch.setattr(transcripts, "fetch_transcript_list", lambda t: listing) monkeypatch.setattr(transcripts, "fetch_transcript", lambda t, y, q: _transcript_df(_CALL)) @@ -175,4 +159,3 @@ def test_ingest_filters_by_min_year_and_builds_records(monkeypatch): assert [r["quarter"] for r in records] == [f"{MIN_FISCAL_YEAR}Q1", f"{MIN_FISCAL_YEAR + 1}Q2"] assert all(r["ticker"] == "AAPL" for r in records) assert all(r["segments"] for r in records) - assert [r["source_transcript_id"] for r in records] == ["t-2015q1", "t-2016q2"]