Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 30 additions & 10 deletions backend/app/ingestion/transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re
from typing import Any
import pandas as pd
from sqlalchemy.orm import Session

from .ticker_config import load_tickers
SOURCE = "defeatbeta"
Expand Down Expand Up @@ -44,6 +45,9 @@ def _classify_speaker_type(speaker: str, role: str, mgmt_speakers: set[str]) ->
# transform
def split_segments(df: pd.DataFrame) -> list[dict[str, Any]]:
"""Tag each transcript paragraph with its section, speaker identity, and Q&A exchange.

Segments are model-agnostic: one row per paragraph with raw text. chunking
is applied later in text_tokenizer
"""
missing = [c for c in _REQUIRED_COLUMNS if c not in df.columns]
if missing:
Expand Down Expand Up @@ -95,21 +99,16 @@ def build_record(
ticker: str,
fiscal_year: int,
fiscal_quarter: int,
report_date: Any,
transcript_df: pd.DataFrame,
source_transcript_id: Any = None,
) -> dict[str, Any]:
if not 1 <= int(fiscal_quarter) <= 4:
raise ValueError(f"fiscal_quarter must be 1-4, got {fiscal_quarter}")

segments = split_segments(transcript_df)
report = pd.to_datetime(report_date, errors="coerce")
return {
"ticker": ticker,
"quarter": f"{int(fiscal_year)}Q{int(fiscal_quarter)}",
"source": SOURCE,
"source_transcript_id": None if source_transcript_id is None else str(source_transcript_id),
"report_date": None if pd.isna(report) else report.date().isoformat(),
"segments": segments,
}

Expand Down Expand Up @@ -142,14 +141,35 @@ def ingest(ticker: str) -> list[dict[str, Any]]:
if year < MIN_FISCAL_YEAR:
continue
quarter = int(row["fiscal_quarter"])
report_date = row.get("report_date")
transcript_id = row.get("transcripts_id")
if transcript_id is not None and pd.isna(transcript_id):
transcript_id = None
df = fetch_transcript(ticker, year, quarter)
records.append(build_record(ticker, year, quarter, report_date, df, transcript_id))
records.append(build_record(ticker, year, quarter, df))
return records


def ingest_all() -> dict[str, list[dict[str, Any]]]:
return {ticker: ingest(ticker) for ticker in load_tickers()}


def upsert_transcripts(session: Session, records: list[dict[str, Any]]) -> int:
"""Upsert transcript records into the transcripts table keyed on (ticker, quarter)

Commits the session. Returns the number of rows upserted.
"""
if not records:
return 0

from sqlalchemy.dialects.postgresql import insert as pg_insert
from app.models.db import Transcript

stmt = pg_insert(Transcript).values(records)
stmt = stmt.on_conflict_do_update(
index_elements=["ticker", "quarter"],
set_={
"source": stmt.excluded.source,
"segments": stmt.excluded.segments,
"ingested_at": stmt.excluded.ingested_at,
},
)
session.execute(stmt)
session.commit()
return len(records)
80 changes: 55 additions & 25 deletions backend/app/ml/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,70 @@
# Do not reorder FEATURE_ORDER without updating MODALITY_GROUPS indices.

FEATURE_ORDER = [
# fundamentals (indices 0–6)
# fundamentals (valuation)
"fcf_yield", "trailing_pe", "forward_pe", "ev_ebitda",
"revenue_cagr_3y", "gross_margin", "net_debt",
# technical (indices 7–12)

# fundamentals (quality)
"revenue_cagr_3y", "gross_margin", "net_debt_to_ebita", # to do: add to fundamentals.py

# technical
"rsi_signal", "macd_signal", "breakout_score", "volume_surge",
"close", "volume",
# macro (indices 13–17)
"fed_funds_rate", "cpi_yoy", "unemployment", "treasury_10y", "yield_spread",
# options (indices 18–20)

# momentum
"return_21d", "return_63d", "return_126d", "market_relative_strength_63d",

#market risk / liquidity
"realized_volatility_21d", "beta_126d", "max_drawdown_63d", "dollar_volume_20d_log",

# macro - pending the FRED ingestion (macro.py still empty)
"fed_funds_rate", "fed_funds_change_3m", "cpi_yoy", "cpi_trend_3m", "unemployment", "unemployment_change_3m",
"treasury_10y", "real_treasury_10y", "yield_spread",

# options
"put_call_ratio", "unusual_options_score", "gamma_exposure",
# sentiment (indices 21–25)
"mgmt_sentiment_score", "qa_sentiment_score", "news_sentiment",

# insider
"insider_buy_score", "net_insider_delta",

# sentiment
"mgmt_sentiment_score", "qa_sentiment_score", "mgmt_qa_sentiment_gap", "transcript_sentiment_change_qoq",

#analyst / sells-side
"price_target_upside", "price_target_dispersion", "recommendation_score", "recent_rating_delta", "coverage_volume",
]

MODALITY_FEATURES: dict[str, list[int]] = {
"valuation": ["fcf_yield", "trailing_pe", "forward_pe", "ev_ebita"],
"quality": ["revenue_cagr_3y", "gross_margin", "net_debt_to_ebita"],
"technical": ["rsi_signal", "macd_signal", "breakout_score", "volume_surge"],
"momentum": [ "return_21d", "return_63d", "return_126d", "market_relative_strength_63d"],
"market_risk": ["realized_volatility_21d", "beta_126d", "max_drawdown_63d", "dollar_volume_20d_log"],
"macro": ["fed_funds_rate", "fed_funds_change_3m", "cpi_yoy", "cpi_trend_3m", "unemployment", "unemployment_change_3m",
"treasury_10y", "real_treasury_10y", "yield_spread"],
"options": [ "put_call_ratio", "unusual_options_score", "gamma_exposure",],
"insider": ["insider_buy_score", "net_insider_delta"],
"transcript": ["mgmt_sentiment_score", "qa_sentiment_score", "mgmt_qa_sentiment_gap", "transcript_sentiment_change_qoq"],
"analyst": ["price_target_upside", "price_target_dispersion", "recommendation_score", "recent_rating_delta", "coverage_volume"]

}
_FEATURE_INDEX: dict[str, int] = {name: i for i, name in enumerate(FEATURE_ORDER)}
MODALITY_GROUPS: dict[str, list[int]] = {
"fundamentals": [0, 1, 2, 3, 4, 5, 6],
"technical": [7, 8, 9, 10, 11, 12],
"macro": [13, 14, 15, 16, 17],
"options": [18, 19, 20],
"sentiment": [21, 22, 23, 24, 25],
modality: [_FEATURE_INDEX[name] for name in names]
for modality, names in MODALITY_FEATURES.items()
}

PERSONA_MODALITIES: dict[str, list[str]] = {
"value_fundamentalist": ["fundamentals"],
"growth_visionary": ["fundamentals"],
"quant_momentum": ["technical"],
"technical_analyst": ["technical"],
"macro_topdown": ["macro", "fundamentals"],
"options_flow_trader": ["options", "technical"],
"sentiment_trader": ["sentiment"],
"insider_institutional": ["sentiment", "fundamentals"],
"dividend_income": ["fundamentals"],
"value_fundamentalist": ["valuation", "quality", "insider", "transcript"],
"growth_visionary": ["valuation", "quality", "analyst", "transcript"],
"quant_momentum": ["momemntum", "market_risk", "options"],
"technical_analyst": ["technical", "momentum", "market_risk"],
"macro_topdown": ["macro", "valuation", "momentum"],
"options_flow_trader": ["options", "technical", "momentum"],
"sentiment_trader": ["transcript", "analyst", "options", "momentum"],
"insider_institutional": ["insider", "valuation", "quality", "transcript"],
"dividend_income": ["valuation", "quality", "macro", "transcript"],
}

N_FEATURES = len(FEATURE_ORDER)
HORIZONS = [21, 63, 126, 252] # trading days: 1M, 3M, 6M, 12M
HORIZONS = [10, 21, 63, 126, 252] # trading days: 2W, 1M, 3M, 6M, 12M

#TO DO - update personal yamls
105 changes: 105 additions & 0 deletions backend/app/ml/text_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# FinBERT text tokenization for sentiment scoring
# NOTE: This module tokenizes transcript *text*: it chunks paragraphs
# to fit FinBERT's 512-token window.

#TO-DO: still need to run FinBERT on chunks to get scores for dataset - Kiana

from __future__ import annotations
import re
from typing import Any

FINBERT_MODEL = "ProsusAI/finbert"
MAX_TOKENS = 450

# Prefer to break a chunk on a sentence boundary so a split never lands mid-clause when avoidable.
_SENTENCE_RE = re.compile(r"(?<=[.!?])\s+")

_tokenizer = None


def get_tokenizer():
"""Lazily load and cache the FinBERT WordPiece tokenizer.
"""
global _tokenizer
if _tokenizer is None:
from transformers import AutoTokenizer

_tokenizer = AutoTokenizer.from_pretrained(FINBERT_MODEL)
return _tokenizer


def _count_tokens(text: str, tokenizer) -> int:
return len(tokenizer.encode(text, add_special_tokens=False))


def _split_long_sentence(sentence: str, tokenizer, max_tokens: int) -> list[tuple[str, int]]:
"""Hard-split a single sentence that on its own exceeds max_tokens, on token windows."""
ids = tokenizer.encode(sentence, add_special_tokens=False)
chunks: list[tuple[str, int]] = []
for start in range(0, len(ids), max_tokens):
window = ids[start:start + max_tokens]
chunks.append((tokenizer.decode(window).strip(), len(window)))
return chunks


def chunk_text(text: str, tokenizer=None, max_tokens: int = MAX_TOKENS) -> list[tuple[str, int]]:
"""Split text into chunks of at most max_tokens FinBERT tokens, preferring sentence breaks.

Returns (chunk_text, token_count) pairs.
"""
if tokenizer is None:
tokenizer = get_tokenizer()

total = _count_tokens(text, tokenizer)
if total <= max_tokens:
return [(text, total)]

chunks: list[tuple[str, int]] = []
current: list[str] = []
current_tokens = 0
for sentence in _SENTENCE_RE.split(text):
sentence = sentence.strip()
if not sentence:
continue
n = _count_tokens(sentence, tokenizer)
if n > max_tokens:
# A single sentence larger than the window: flush what we have, then window-split it.
if current:
chunks.append((" ".join(current), current_tokens))
current, current_tokens = [], 0
chunks.extend(_split_long_sentence(sentence, tokenizer, max_tokens))
continue
if current_tokens + n > max_tokens:
chunks.append((" ".join(current), current_tokens))
current, current_tokens = [sentence], n
else:
current.append(sentence)
current_tokens += n
if current:
chunks.append((" ".join(current), current_tokens))
return chunks


def chunk_segments(segments: list[dict[str, Any]], tokenizer=None) -> list[dict[str, Any]]:
"""Expand transcript segments into FinBERT-sized scoring chunks.

Each input segment (one paragraph) yields one or more chunk dicts that carry the paragraph's identity and tags (paragraph, role, speaker,
speaker_type, qa_exchange) plus the chunk's `chunk` index within the paragraph, its `token_count`, and `text`.
"""
if tokenizer is None:
tokenizer = get_tokenizer()

chunks: list[dict[str, Any]] = []
for seg in segments:
for chunk_index, (chunk, token_count) in enumerate(chunk_text(seg["text"], tokenizer)):
chunks.append({
"paragraph": seg["paragraph"],
"chunk": chunk_index,
"role": seg["role"],
"speaker": seg["speaker"],
"speaker_type": seg["speaker_type"],
"qa_exchange": seg["qa_exchange"],
"token_count": token_count,
"text": chunk,
})
return chunks
97 changes: 97 additions & 0 deletions backend/tests/test_text_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from __future__ import annotations

import pytest

from app.ml import text_tokenizer
from app.ml.text_tokenizer import MAX_TOKENS, chunk_segments, chunk_text


class _FakeTokenizer:
"""Whitespace stand-in for the FinBERT WordPiece tokenizer.
"""

def encode(self, text: str, add_special_tokens: bool = False) -> list[str]:
return text.split()

def decode(self, ids: list[str]) -> str:
return " ".join(ids)


@pytest.fixture(autouse=True)
def _fake_tokenizer(monkeypatch):
monkeypatch.setattr(text_tokenizer, "get_tokenizer", lambda: _FakeTokenizer())


# chunk_text
def test_chunk_text_short_text_is_single_chunk():
chunks = chunk_text("We had a strong quarter with record revenue.")
assert len(chunks) == 1
text, token_count = chunks[0]
assert text == "We had a strong quarter with record revenue."
assert token_count == 8


def test_chunk_text_window_splits_long_unbroken_text():
# No sentence breaks, longer than the window -> window-split into ordered chunks, nothing lost.
long_text = " ".join(["word"] * (MAX_TOKENS * 2 + 30))
chunks = chunk_text(long_text)
assert len(chunks) == 3
assert all(token_count <= MAX_TOKENS for _, token_count in chunks)
assert " ".join(text for text, _ in chunks) == long_text


def test_chunk_text_packs_sentences_under_the_limit():
sentence = " ".join(["alpha"] * 90) + "."
paragraph = " ".join([sentence] * 8) # 8 * ~91 tokens ~ 728 > MAX_TOKENS
chunks = chunk_text(paragraph)
assert len(chunks) >= 2
assert all(token_count <= MAX_TOKENS for _, token_count in chunks)


def test_chunk_text_preserves_raw_text_verbatim():
text = "Margins? We DON'T disclose that... yet."
chunks = chunk_text(text)
assert chunks == [(text, len(text.split()))]


def test_chunk_text_accepts_injected_tokenizer():
chunks = chunk_text("one two three", tokenizer=_FakeTokenizer())
assert chunks == [("one two three", 3)]


# chunk_segments
def _segment(paragraph: int, text: str, **overrides) -> dict:
seg = {
"paragraph": paragraph,
"role": "qa",
"speaker": "Jane CEO",
"speaker_type": "executive",
"qa_exchange": 1,
"text": text,
}
seg.update(overrides)
return seg


def test_chunk_segments_carries_tags_and_indexes_chunks():
long_text = " ".join(["word"] * (MAX_TOKENS + 5))
segments = [
_segment(1, "Short answer.", qa_exchange=1),
_segment(2, long_text, qa_exchange=2),
]
chunks = chunk_segments(segments)

# Paragraph 1 stays one chunk; paragraph 2 splits into two.
by_para = {}
for c in chunks:
by_para.setdefault(c["paragraph"], []).append(c)
assert [c["chunk"] for c in by_para[1]] == [0]
assert [c["chunk"] for c in by_para[2]] == [0, 1]

# Tags from the source segment ride along onto every chunk.
assert by_para[2][0]["qa_exchange"] == 2
assert by_para[2][0]["speaker_type"] == "executive"
assert all(c["token_count"] <= MAX_TOKENS for c in chunks)
assert set(chunks[0].keys()) == {
"paragraph", "chunk", "role", "speaker", "speaker_type", "qa_exchange", "token_count", "text",
}
Loading