Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions backend/tests/unit/test_tools_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,15 @@ def test_with_conversations(self):
class TestSearchConversationsText:
def setup_method(self):
    """Return the three shared DB mocks to a clean, empty-result state."""
    shared_mocks = (
        vector_db.query_vectors,
        conversations_db.get_conversations,
        conversations_db.get_conversations_by_id,
    )
    for mocked_call in shared_mocks:
        # Clear recorded calls, drop any configured side effect, and default
        # to an empty result so each test only sets what it needs.
        mocked_call.reset_mock()
        mocked_call.side_effect = None
        mocked_call.return_value = []

def test_no_results(self):
Expand Down Expand Up @@ -389,6 +396,97 @@ def test_end_date_only_sets_starts_at(self):
assert call_kwargs[1]['starts_at'] == 0
assert call_kwargs[1]['ends_at'] is not None

def test_lexical_rank_prioritizes_title_and_overview_matches(self):
    """A title hit must be ranked ahead of a transcript-only mention."""
    transcript_only = {
        "id": "conv-transcript",
        "structured": {"title": "Random catchup", "overview": "General discussion"},
        "transcript_segments": [{"text": "We briefly mentioned ERPNext once."}],
        "is_locked": False,
    }
    title_hit = {
        "id": "conv-title",
        "structured": {"title": "ERPNext CRM automation", "overview": "Lead pipeline work"},
        "transcript_segments": [],
        "is_locked": False,
    }

    # The transcript-only conversation is deliberately listed first so the
    # assertion proves re-ordering rather than input order.
    ranked_ids = conversations_svc._rank_conversations_lexically(
        "ERPNext",
        [transcript_only, title_hit],
        limit=2,
    )

    assert ranked_ids == ["conv-title", "conv-transcript"]

def test_search_uses_lexical_fallback_when_vector_returns_empty(self):
    """With no vector hits, the search reports results via lexical retrieval."""
    stored = [
        {
            "id": "conv-erp",
            "structured": {"title": "CRM Automation", "overview": "Discussed ERPNext lead pipeline"},
            "transcript_segments": [],
            "is_locked": False,
        }
    ]

    vector_db.query_vectors.return_value = []
    # The same list backs both the candidate listing and the fetch-by-id call.
    conversations_db.get_conversations.return_value = stored
    conversations_db.get_conversations_by_id.return_value = stored

    search_kwargs = {
        "uid": "uid-1",
        "query": "ERPNext",
        "limit": 5,
        "include_transcript": False,
    }
    result = conversations_svc.search_conversations_text(**search_kwargs)

    assert "Found 1 conversations matching 'ERPNext' via lexical retrieval" in result
    conversations_db.get_conversations.assert_called_once()
    conversations_db.get_conversations_by_id.assert_called_once_with("uid-1", ["conv-erp"])

def test_search_lexical_fallback_matches_transcript_text(self):
    """A query found only in transcript segments is still returned by the fallback."""
    stored = [
        {
            "id": "conv-iim",
            "structured": {"title": "Research notes", "overview": "Chatbot evaluation"},
            "transcript_segments": [{"text": "Today we discussed IIM Ranchi and intent classification."}],
            "is_locked": False,
        }
    ]

    vector_db.query_vectors.return_value = []
    conversations_db.get_conversations.return_value = stored
    conversations_db.get_conversations_by_id.return_value = stored

    search_kwargs = {
        "uid": "uid-1",
        "query": "IIM Ranchi",
        "limit": 5,
        "include_transcript": False,
    }
    result = conversations_svc.search_conversations_text(**search_kwargs)

    assert "Found 1 conversations matching 'IIM Ranchi' via lexical retrieval" in result
    conversations_db.get_conversations_by_id.assert_called_once_with("uid-1", ["conv-iim"])

def test_search_lexical_fallback_filters_locked_candidates(self):
    """A locked conversation is never surfaced by the fallback, even on a strong match."""
    locked_match = {
        "id": "conv-locked",
        "structured": {"title": "ERPNext", "overview": "ERPNext implementation details"},
        "transcript_segments": [],
        "is_locked": True,
    }

    vector_db.query_vectors.return_value = []
    conversations_db.get_conversations.return_value = [locked_match]

    search_kwargs = {
        "uid": "uid-1",
        "query": "ERPNext",
        "limit": 5,
        "include_transcript": False,
    }
    result = conversations_svc.search_conversations_text(**search_kwargs)

    assert "No conversations found matching 'ERPNext'" in result
    # With nothing ranked there must be no follow-up fetch by ID.
    conversations_db.get_conversations_by_id.assert_not_called()


# ===========================================================================
# Tests: get_memories_text
Expand Down
148 changes: 141 additions & 7 deletions backend/utils/retrieval/tool_services/conversations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
Used by both LangChain tools (mobile chat) and REST router (desktop/web).
"""

import logging
import re
from collections import Counter
from datetime import datetime, timezone
from typing import List, Optional

Expand All @@ -14,10 +16,11 @@
from models.other import Person
from utils.conversations.factory import deserialize_conversation
from utils.conversations.render import conversations_to_string
import logging

logger = logging.getLogger(__name__)

# Maximum number of conversations requested from
# conversations_db.get_conversations (offset 0) as candidates for lexical
# ranking when vector search returns no IDs. Conversations beyond this window
# are never scored by the fallback.
LEXICAL_FALLBACK_CANDIDATE_LIMIT = 200
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Silent recall cap for users with large conversation histories

LEXICAL_FALLBACK_CANDIDATE_LIMIT = 200 means the fallback searches only the 200 most-recently-created conversations. A user who had a relevant conversation #201 or older will silently get "No conversations found" even though lexical scoring would have ranked it. Consider logging when len(candidate_conversations) == LEXICAL_FALLBACK_CANDIDATE_LIMIT to make this ceiling observable in production.



def parse_iso_date(date_str: str, param_name: str) -> datetime:
"""Parse ISO date string with timezone. Raises ValueError on bad format."""
Expand All @@ -32,6 +35,100 @@ def parse_iso_date(date_str: str, param_name: str) -> datetime:
return dt


def _tokenize_for_lexical_search(text: str) -> List[str]:
"""Tokenize text for lightweight exact-term retrieval."""
if not text:
return []
return re.findall(r"[a-z0-9]+", text.lower())


def _conversation_structured_text(conv_data: dict) -> tuple[str, str]:
"""Return title and overview text from dict-shaped structured data."""
structured = conv_data.get('structured') or {}
if not isinstance(structured, dict):
return '', ''
return structured.get('title') or '', structured.get('overview') or ''


def _conversation_transcript_text(conv_data: dict) -> str:
"""Return transcript text from transcript segment dictionaries."""
segments = conv_data.get('transcript_segments') or []
texts = []
for segment in segments:
if isinstance(segment, dict):
text = segment.get('text') or ''
if text:
texts.append(text)
return ' '.join(texts)


def _score_conversation_lexically(query: str, conv_data: dict) -> float:
    """Score one conversation against ``query`` with an exact-token heuristic.

    A dependency-free, BM25-inspired fallback for names, acronyms, products,
    tools and other exact strings that vector search can miss. The score is
    the sum of three parts:

    * per-occurrence token hits, weighted by field (title 6.0, overview 4.0,
      transcript 1.5), counted once per query token including repeats;
    * a whole-phrase bonus when the lower-cased, stripped query is a
      substring of the field (title 12.0, overview 8.0, transcript 4.0);
    * 2.0 for every distinct query token present in any field.

    Returns 0.0 when the query yields no tokens.
    """
    terms = _tokenize_for_lexical_search(query)
    if not terms:
        return 0.0

    title, overview = _conversation_structured_text(conv_data)
    transcript = _conversation_transcript_text(conv_data)

    # (field text, weight per token occurrence, bonus for a whole-phrase hit)
    weighted_fields = (
        (title, 6.0, 12.0),
        (overview, 4.0, 8.0),
        (transcript, 1.5, 4.0),
    )

    # Phrase matches are especially useful for names like "IIM Ranchi" or
    # products like "ERPNext CRM".
    phrase = query.lower().strip()

    total = 0.0
    vocabulary = set()
    for field_text, term_weight, phrase_bonus in weighted_fields:
        counts = Counter(_tokenize_for_lexical_search(field_text))
        total += sum(counts[term] for term in terms) * term_weight
        if phrase and phrase in field_text.lower():
            total += phrase_bonus
        vocabulary.update(counts)

    # Reward covering more unique query terms so one repeated token does not win.
    total += len(set(terms) & vocabulary) * 2.0

    return total


def _rank_conversations_lexically(query: str, conversations_data: List[dict], limit: int) -> List[str]:
    """Return up to ``limit`` conversation IDs, best lexical score first.

    Locked conversations, entries without an ``id``, and entries whose score
    is not positive are left out. Ties keep their input order.
    """
    matches = []
    for candidate in conversations_data:
        candidate_id = candidate.get('id')
        if candidate.get('is_locked', False) or not candidate_id:
            continue

        relevance = _score_conversation_lexically(query, candidate)
        if relevance > 0:
            matches.append((relevance, candidate_id))

    # Sort on score only; the sort is stable, so equal scores keep input order.
    ranked = sorted(matches, key=lambda pair: pair[0], reverse=True)
    return [candidate_id for _relevance, candidate_id in ranked[:limit]]


def get_conversations_text(
uid: str,
start_date: Optional[str] = None,
Expand Down Expand Up @@ -142,7 +239,7 @@ def search_conversations_text(
include_transcript: bool = True,
include_timestamps: bool = False,
) -> str:
"""Semantic vector search for conversations, formatted as LLM-ready text."""
"""Hybrid conversation search with vector retrieval and lexical fallback."""
logger.info(f"search_conversations_text - uid: {uid}, query: {query}, limit: {limit}")

# Cap limits
Expand All @@ -153,16 +250,18 @@ def search_conversations_text(
# Parse date filters to timestamps
starts_at = None
ends_at = None
start_dt = None
end_dt = None
if start_date:
try:
dt = parse_iso_date(start_date, 'start_date')
starts_at = int(dt.timestamp())
start_dt = parse_iso_date(start_date, 'start_date')
starts_at = int(start_dt.timestamp())
except ValueError as e:
return f"Error: Invalid start_date format: {e}"
if end_date:
try:
dt = parse_iso_date(end_date, 'end_date')
ends_at = int(dt.timestamp())
end_dt = parse_iso_date(end_date, 'end_date')
ends_at = int(end_dt.timestamp())
except ValueError as e:
return f"Error: Invalid end_date format: {e}"

Expand All @@ -174,8 +273,39 @@ def search_conversations_text(
starts_at = 0 # epoch

try:
# Existing vector-only implementation:
# conversation_ids = vector_db.query_vectors(query=query, uid=uid, starts_at=starts_at, ends_at=ends_at, k=limit)
#
# if not conversation_ids:
# date_info = ""
# if starts_at and ends_at:
# date_info = " in the specified date range"
# elif starts_at:
# date_info = " after the specified start date"
# elif ends_at:
# date_info = " before the specified end date"
# return f"No conversations found matching '{query}'{date_info}."
#
# conversations_data = conversations_db.get_conversations_by_id(uid, conversation_ids)
Comment on lines +276 to +289
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Dead commented-out code block

The old vector-only implementation is left as a 14-line comment block. This is pure noise — the git history already preserves the original logic. The comment also risks confusion: a future reader may assume the block is meant to be re-enabled or may accidentally un-comment it.


conversation_ids = vector_db.query_vectors(query=query, uid=uid, starts_at=starts_at, ends_at=ends_at, k=limit)

retrieval_mode = "semantic"
if not conversation_ids:
logger.info("search_conversations_text - vector search returned no results, trying lexical fallback")

candidate_conversations = conversations_db.get_conversations(
uid,
limit=LEXICAL_FALLBACK_CANDIDATE_LIMIT,
offset=0,
start_date=start_dt,
end_date=end_dt,
include_discarded=False,
statuses=["processing", "completed"],
)
conversation_ids = _rank_conversations_lexically(query, candidate_conversations, limit)
retrieval_mode = "lexical"

if not conversation_ids:
date_info = ""
if starts_at and ends_at:
Expand All @@ -195,6 +325,10 @@ def search_conversations_text(
if not conversations_data:
return f"No conversations found matching query: '{query}'"

# Preserve retrieval ranking after Firestore fetch.
order = {conversation_id: idx for idx, conversation_id in enumerate(conversation_ids)}
conversations_data.sort(key=lambda c: order.get(c.get('id'), len(order)))

# Load people
people = []
if include_transcript:
Expand Down Expand Up @@ -222,7 +356,7 @@ def search_conversations_text(
logger.error(f"Error parsing conversation {conv_data.get('id')}: {e}")
continue

result = f"Found {len(conversations)} conversations semantically matching '{query}':\n\n"
result = f"Found {len(conversations)} conversations matching '{query}' via {retrieval_mode} retrieval:\n\n"
result += conversations_to_string(
conversations, use_transcript=include_transcript, include_timestamps=include_timestamps, people=people
)
Expand Down