From 232fee88c40fc50135799990e62c8e0d3d3e3ae5 Mon Sep 17 00:00:00 2001 From: "MSI\\hupeky" Date: Mon, 2 Feb 2026 15:45:48 +0700 Subject: [PATCH] fix: improve Cypher query generation accuracy This PR addresses issues where the LLM generates incorrect Cypher queries due to misunderstanding the graph schema. Changes: - Add CYPHER_EXAMPLE_CLASS_METHODS to demonstrate DEFINES_METHOD pattern - Add VALUE PATTERN RULES to prompts explaining name vs qualified_name usage - Improve _clean_cypher_response() to handle markdown formatting in LLM output The prompt improvements teach the LLM to: - Use `name` property for short class/function names (not qualified_name) - Use correct relationships (DEFINES_METHOD, DEFINES) - Follow proper Cypher patterns for this schema The response cleaner now handles: - Triple backtick code blocks (```cypher ... ```) - Bold markdown headers (**Cypher Query:**) - Mixed formatting in LLM responses Co-Authored-By: Claude Opus 4.5 --- codebase_rag/cypher_queries.py | 4 ++-- codebase_rag/prompts.py | 10 +++++++++- codebase_rag/services/llm.py | 27 ++++++++++++++++++++++++--- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/codebase_rag/cypher_queries.py b/codebase_rag/cypher_queries.py index 8d70bae4e..82e007bbb 100644 --- a/codebase_rag/cypher_queries.py +++ b/codebase_rag/cypher_queries.py @@ -52,8 +52,8 @@ CYPHER_EXAMPLE_LIMIT_ONE = """MATCH (f:File) RETURN f.path as path, f.name as name, labels(f) as type LIMIT 1""" CYPHER_EXAMPLE_CLASS_METHODS = f"""MATCH (c:Class)-[:DEFINES_METHOD]->(m:Method) -WHERE c.qualified_name ENDS WITH '.UserService' -RETURN m.name AS name, m.qualified_name AS qualified_name, labels(m) AS type +WHERE c.name = 'UserService' +RETURN c.name AS className, m.name AS methodName, m.qualified_name AS qualified_name, labels(m) AS type LIMIT {CYPHER_DEFAULT_LIMIT}""" CYPHER_EXPORT_NODES = """ diff --git a/codebase_rag/prompts.py b/codebase_rag/prompts.py index de5cce132..48bbe8d4b 100644 --- a/codebase_rag/prompts.py +++ b/codebase_rag/prompts.py @@ -196,6 +196,14 @@ def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str: - CORRECT: `MATCH (c:Class) RETURN count(c) AS total` - WRONG: `MATCH (c:Class) RETURN c.name, count(c) AS total` (returns all items!) +**VALUE PATTERN RULES (CRITICAL FOR NAME MATCHING):** +- The `qualified_name` property contains FULL paths like: `'Project.folder.subfolder.ClassName'` +- When users mention a class or function by SHORT NAME (e.g., "VatManager", "UserService"), you MUST match using the `name` property, NOT `qualified_name`. +- CORRECT: `WHERE c.name = 'VatManager'` +- WRONG: `WHERE c.qualified_name = 'VatManager'` (will never match!) +- Use `DEFINES_METHOD` relationship to find methods of a class. +- Use `DEFINES` relationship to find functions/classes defined in a module. + **Examples:** * **Natural Language:** "How many classes are there?" @@ -235,7 +243,7 @@ def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str: ``` * **Natural Language:** "What methods does UserService have?" or "Show me methods in UserService" or "List UserService methods" -* **Cypher Query (Use ENDS WITH to match class by short name):** +* **Cypher Query (Note: match by `name` property, use `DEFINES_METHOD` relationship):** ```cypher {CYPHER_EXAMPLE_CLASS_METHODS} ``` diff --git a/codebase_rag/services/llm.py b/codebase_rag/services/llm.py index 018ccc1af..0ab738eae 100644 --- a/codebase_rag/services/llm.py +++ b/codebase_rag/services/llm.py @@ -26,9 +26,30 @@ def _create_provider_model(config: ModelConfig) -> Model: def _clean_cypher_response(response_text: str) -> str: - query = response_text.strip().replace(cs.CYPHER_BACKTICK, "") - if query.startswith(cs.CYPHER_PREFIX): - query = query[len(cs.CYPHER_PREFIX) :].strip() + """Clean LLM response to extract pure Cypher query. + + Handles markdown formatting that models sometimes output: + - Triple backticks (```cypher ... ```) + - Bold text (**Cypher Query:**) + - Headers and other markdown + """ + import re + + query = response_text.strip() + + # Extract content from code blocks if present (```cypher ... ``` or ``` ... ```) + code_block_match = re.search(r"```(?:cypher)?\s*(.*?)```", query, re.DOTALL | re.IGNORECASE) + if code_block_match: + query = code_block_match.group(1).strip() + else: + # Remove markdown bold/headers (e.g., **Cypher Query:**) + query = re.sub(r"\*\*[^*]+\*\*:?\s*", "", query) + # Remove single backticks + query = query.replace(cs.CYPHER_BACKTICK, "") + # Remove "cypher" prefix if present + if query.lower().startswith(cs.CYPHER_PREFIX): + query = query[len(cs.CYPHER_PREFIX):].strip() + if not query.endswith(cs.CYPHER_SEMICOLON): query += cs.CYPHER_SEMICOLON return query