From f9927c36e4e8f4cefcd9c896a2133aed214f578a Mon Sep 17 00:00:00 2001 From: Ratnam Ojha Date: Thu, 21 May 2026 09:14:29 +0530 Subject: [PATCH 1/2] Add hybrid semantic + lexical conversation retrieval --- backend/tests/unit/test_tools_router.py | 97 ++++++++++++ .../retrieval/tool_services/conversations.py | 148 +++++++++++++++++- 2 files changed, 238 insertions(+), 7 deletions(-) diff --git a/backend/tests/unit/test_tools_router.py b/backend/tests/unit/test_tools_router.py index 504c48666d5..c23b3f028a7 100644 --- a/backend/tests/unit/test_tools_router.py +++ b/backend/tests/unit/test_tools_router.py @@ -353,8 +353,15 @@ def test_with_conversations(self): class TestSearchConversationsText: def setup_method(self): vector_db.query_vectors.reset_mock() + vector_db.query_vectors.side_effect = None vector_db.query_vectors.return_value = [] + + conversations_db.get_conversations.reset_mock() + conversations_db.get_conversations.side_effect = None + conversations_db.get_conversations.return_value = [] + conversations_db.get_conversations_by_id.reset_mock() + conversations_db.get_conversations_by_id.side_effect = None conversations_db.get_conversations_by_id.return_value = [] def test_no_results(self): @@ -389,6 +396,96 @@ def test_end_date_only_sets_starts_at(self): assert call_kwargs[1]['starts_at'] == 0 assert call_kwargs[1]['ends_at'] is not None + def test_lexical_rank_prioritizes_title_and_overview_matches(self): + conversations = [ + { + "id": "conv-transcript", + "structured": {"title": "Random catchup", "overview": "General discussion"}, + "transcript_segments": [{"text": "We briefly mentioned ERPNext once."}], + "is_locked": False, + }, + { + "id": "conv-title", + "structured": {"title": "ERPNext CRM automation", "overview": "Lead pipeline work"}, + "transcript_segments": [], + "is_locked": False, + }, + ] + + ranked_ids = conversations_svc._rank_conversations_lexically("ERPNext", conversations, limit=2) + + assert ranked_ids == ["conv-title", "conv-transcript"] + + def test_search_uses_lexical_fallback_when_vector_returns_empty(self): + vector_db.query_vectors.return_value = [] + + fallback_candidates = [ + { + "id": "conv-erp", + "structured": {"title": "CRM Automation", "overview": "Discussed ERPNext lead pipeline"}, + "transcript_segments": [], + "is_locked": False, + } + ] + conversations_db.get_conversations.return_value = fallback_candidates + conversations_db.get_conversations_by_id.return_value = fallback_candidates + + result = conversations_svc.search_conversations_text( + uid="uid-1", + query="ERPNext", + limit=5, + include_transcript=False, + ) + + assert "Found 1 conversations matching 'ERPNext' via lexical retrieval" in result + conversations_db.get_conversations.assert_called_once() + conversations_db.get_conversations_by_id.assert_called_once_with("uid-1", ["conv-erp"]) + + def test_search_lexical_fallback_matches_transcript_text(self): + vector_db.query_vectors.return_value = [] + + fallback_candidates = [ + { + "id": "conv-iim", + "structured": {"title": "Research notes", "overview": "Chatbot evaluation"}, + "transcript_segments": [{"text": "Today we discussed IIM Ranchi and intent classification."}], + "is_locked": False, + } + ] + conversations_db.get_conversations.return_value = fallback_candidates + conversations_db.get_conversations_by_id.return_value = fallback_candidates + + result = conversations_svc.search_conversations_text( + uid="uid-1", + query="IIM Ranchi", + limit=5, + include_transcript=False, + ) + + assert "Found 1 conversations matching 'IIM Ranchi' via lexical retrieval" in result + conversations_db.get_conversations_by_id.assert_called_once_with("uid-1", ["conv-iim"]) + + def test_search_lexical_fallback_filters_locked_candidates(self): + vector_db.query_vectors.return_value = [] + + conversations_db.get_conversations.return_value = [ + { + "id": "conv-locked", + "structured": {"title": "ERPNext", "overview": "ERPNext implementation details"}, + "transcript_segments": [], + "is_locked": True, + } + ] + + result = conversations_svc.search_conversations_text( + uid="uid-1", + query="ERPNext", + limit=5, + include_transcript=False, + ) + + assert "No conversations found matching 'ERPNext'" in result + conversations_db.get_conversations_by_id.assert_not_called() # =========================================================================== # Tests: get_memories_text diff --git a/backend/utils/retrieval/tool_services/conversations.py b/backend/utils/retrieval/tool_services/conversations.py index 4e930ae3c5e..11bf093e8f1 100644 --- a/backend/utils/retrieval/tool_services/conversations.py +++ b/backend/utils/retrieval/tool_services/conversations.py @@ -3,7 +3,9 @@ Used by both LangChain tools (mobile chat) and REST router (desktop/web). """ +import logging import re +from collections import Counter from datetime import datetime, timezone from typing import List, Optional @@ -14,10 +16,11 @@ from models.other import Person from utils.conversations.factory import deserialize_conversation from utils.conversations.render import conversations_to_string -import logging logger = logging.getLogger(__name__) +LEXICAL_FALLBACK_CANDIDATE_LIMIT = 200 + def parse_iso_date(date_str: str, param_name: str) -> datetime: """Parse ISO date string with timezone. Raises ValueError on bad format.""" @@ -32,6 +35,100 @@ def parse_iso_date(date_str: str, param_name: str) -> datetime: return dt +def _tokenize_for_lexical_search(text: str) -> List[str]: + """Tokenize text for lightweight exact-term retrieval.""" + if not text: + return [] + return re.findall(r"[a-z0-9]+", text.lower()) + + +def _conversation_structured_text(conv_data: dict) -> tuple[str, str]: + """Return title and overview text from dict-shaped structured data.""" + structured = conv_data.get('structured') or {} + if not isinstance(structured, dict): + return '', '' + return structured.get('title') or '', structured.get('overview') or '' + + +def _conversation_transcript_text(conv_data: dict) -> str: + """Return transcript text from transcript segment dictionaries.""" + segments = conv_data.get('transcript_segments') or [] + texts = [] + for segment in segments: + if isinstance(segment, dict): + text = segment.get('text') or '' + if text: + texts.append(text) + return ' '.join(texts) + + +def _score_conversation_lexically(query: str, conv_data: dict) -> float: + """Score one conversation using a small BM25-inspired exact-token heuristic. + + This intentionally avoids dependencies. It is meant as a fallback for + names, acronyms, products, tools, and other exact strings that vector search + can miss. + """ + query_tokens = _tokenize_for_lexical_search(query) + if not query_tokens: + return 0.0 + + title, overview = _conversation_structured_text(conv_data) + transcript = _conversation_transcript_text(conv_data) + + title_tokens = Counter(_tokenize_for_lexical_search(title)) + overview_tokens = Counter(_tokenize_for_lexical_search(overview)) + transcript_tokens = Counter(_tokenize_for_lexical_search(transcript)) + + score = 0.0 + for token in query_tokens: + score += title_tokens[token] * 6.0 + score += overview_tokens[token] * 4.0 + score += transcript_tokens[token] * 1.5 + + query_lc = query.lower().strip() + title_lc = title.lower() + overview_lc = overview.lower() + transcript_lc = transcript.lower() + + # Phrase matches are especially useful for names like "IIM Ranchi" or + # products like "ERPNext CRM". + if query_lc: + if query_lc in title_lc: + score += 12.0 + if query_lc in overview_lc: + score += 8.0 + if query_lc in transcript_lc: + score += 4.0 + + # Reward covering more unique query terms so one repeated token does not win. + unique_query_tokens = set(query_tokens) + all_tokens = set(title_tokens) | set(overview_tokens) | set(transcript_tokens) + covered = len(unique_query_tokens & all_tokens) + score += covered * 2.0 + + return score + + +def _rank_conversations_lexically(query: str, conversations_data: List[dict], limit: int) -> List[str]: + """Return conversation IDs ranked by lexical score.""" + scored = [] + for conv_data in conversations_data: + if conv_data.get('is_locked', False): + continue + + conv_id = conv_data.get('id') + if not conv_id: + continue + + score = _score_conversation_lexically(query, conv_data) + if score > 0: + scored.append((score, conv_id)) + + scored.sort(key=lambda item: item[0], reverse=True) + return [conv_id for _, conv_id in scored[:limit]] + + def get_conversations_text( uid: str, start_date: Optional[str] = None, @@ -142,7 +239,7 @@ def search_conversations_text( include_transcript: bool = True, include_timestamps: bool = False, ) -> str: - """Semantic vector search for conversations, formatted as LLM-ready text.""" + """Hybrid conversation search with vector retrieval and lexical fallback.""" logger.info(f"search_conversations_text - uid: {uid}, query: {query}, limit: {limit}") # Cap limits @@ -153,16 +250,18 @@ def search_conversations_text( # Parse date filters to timestamps starts_at = None ends_at = None + start_dt = None + end_dt = None if start_date: try: - dt = parse_iso_date(start_date, 'start_date') - starts_at = int(dt.timestamp()) + start_dt = parse_iso_date(start_date, 'start_date') + starts_at = int(start_dt.timestamp()) except ValueError as e: return f"Error: Invalid start_date format: {e}" if end_date: try: - dt = parse_iso_date(end_date, 'end_date') - ends_at = int(dt.timestamp()) + end_dt = parse_iso_date(end_date, 'end_date') + ends_at = int(end_dt.timestamp()) except ValueError as e: return f"Error: Invalid end_date format: {e}" @@ -174,8 +273,39 @@ def search_conversations_text( starts_at = 0 # epoch try: + # Existing vector-only implementation: + # conversation_ids = vector_db.query_vectors(query=query, uid=uid, starts_at=starts_at, ends_at=ends_at, k=limit) + # + # if not conversation_ids: + # date_info = "" + # if starts_at and ends_at: + # date_info = " in the specified date range" + # elif starts_at: + # date_info = " after the specified start date" + # elif ends_at: + # date_info = " before the specified end date" + # return f"No conversations found matching '{query}'{date_info}." + # + # conversations_data = conversations_db.get_conversations_by_id(uid, conversation_ids) + conversation_ids = vector_db.query_vectors(query=query, uid=uid, starts_at=starts_at, ends_at=ends_at, k=limit) + retrieval_mode = "semantic" + if not conversation_ids: + logger.info("search_conversations_text - vector search returned no results, trying lexical fallback") + + candidate_conversations = conversations_db.get_conversations( + uid, + limit=LEXICAL_FALLBACK_CANDIDATE_LIMIT, + offset=0, + start_date=start_dt, + end_date=end_dt, + include_discarded=False, + statuses=["processing", "completed"], + ) + conversation_ids = _rank_conversations_lexically(query, candidate_conversations, limit) + retrieval_mode = "lexical" + if not conversation_ids: date_info = "" if starts_at and ends_at: @@ -195,6 +325,10 @@ def search_conversations_text( if not conversations_data: return f"No conversations found matching query: '{query}'" + # Preserve retrieval ranking after Firestore fetch. + order = {conversation_id: idx for idx, conversation_id in enumerate(conversation_ids)} + conversations_data.sort(key=lambda c: order.get(c.get('id'), len(order))) + # Load people people = [] if include_transcript: @@ -222,7 +356,7 @@ def search_conversations_text( logger.error(f"Error parsing conversation {conv_data.get('id')}: {e}") continue - result = f"Found {len(conversations)} conversations semantically matching '{query}':\n\n" + result = f"Found {len(conversations)} conversations matching '{query}' via {retrieval_mode} retrieval:\n\n" result += conversations_to_string( conversations, use_transcript=include_transcript, include_timestamps=include_timestamps, people=people ) From 87fda679dafefa43626a7785b2c975b02ea369ed Mon Sep 17 00:00:00 2001 From: Ratnam Ojha Date: Sun, 24 May 2026 00:55:35 +0530 Subject: [PATCH 2/2] Format tools router tests --- backend/tests/unit/test_tools_router.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/tests/unit/test_tools_router.py b/backend/tests/unit/test_tools_router.py index c23b3f028a7..59bb3cbf711 100644 --- a/backend/tests/unit/test_tools_router.py +++ b/backend/tests/unit/test_tools_router.py @@ -487,6 +487,7 @@ def test_search_lexical_fallback_filters_locked_candidates(self): assert "No conversations found matching 'ERPNext'" in result conversations_db.get_conversations_by_id.assert_not_called() + # =========================================================================== # Tests: get_memories_text # ===========================================================================