diff --git a/DEBUG_REVIEW.md b/DEBUG_REVIEW.md new file mode 100644 index 0000000..d0b7085 --- /dev/null +++ b/DEBUG_REVIEW.md @@ -0,0 +1,200 @@ +# ReasonGraph Debug Review: "Available evidence does not sufficiently support a reliable answer." + +This document reviews the pipeline file-by-file to find why the verifier keeps rejecting answers and the system falls back to graceful failure. + +--- + +## Flow summary + +``` +main.py → initial_state(query) + → graph: START → query_optimizer → retriever → generator → verifier + → loop_controller: + - verifier_passed → END + - search_count < max_search → increment_search → query_optimizer (retry) + - else → graceful_failure → END +``` + +**When you see the generic message:** The graph ran the verifier, it failed (verifier_passed=False), and after `max_search` attempts (default 2, so up to 3 total runs) the loop controller sent the run to **graceful_failure**, which overwrites `draft_answer` with that message. So the root cause is **verifier failing on every attempt**. + +--- + +## 1. app/graph/state.py + +**Purpose:** Define `GraphState` and `initial_state()` so every node has a complete state (no missing keys). + +**Checks:** +- All fields are initialized in `initial_state()` — **OK** +- `search_count=0` and `max_search=2` — **OK** (gives up to 3 attempts: count 0→1→2, then graceful_failure) +- State is complete — **OK** + +**Potential bugs:** None. If `search_count` or `max_search` were ever missing, `loop_controller` would raise `KeyError`; we added `.get()` fallbacks in loop_controller and kept `initial_state` complete. + +**Connection:** Passed to graph; query_optimizer reads `user_query`, retriever reads `user_query` + `optimized_queries`, generator reads `reranked_docs`, verifier reads `draft_answer`, `citations`, `reranked_docs`, loop_controller reads `verifier_passed`, `search_count`, `max_search`. + +**Could cause graceful failure?** No. 
Missing/incomplete state could cause crashes, not systematic verifier failure. + +--- + +## 2. app/nodes/query_optimizer.py + +**Purpose:** Turn the user query into 3 optimized search queries for better recall. + +**Checks:** +- Produces up to 3 queries — **OK** (falls back to `[user_query]` on parse/API failure) +- No explicit handling of "BD" / "S&E" variations — **was a gap**. The document says "bd and s & e" (with spaces); if optimized queries stay as "BD and S&E", BM25/vector may not match as well. + +**Fix applied:** Prompt now asks to expand abbreviations (e.g. "business development", "s & e") in at least one query for recall. + +**Potential bugs:** +- On exception, returns `[user_query]` only (one query) — retrieval still runs. +- If the LLM returns a non-array or empty array, we fall back to single query. + +**Connection:** Writes `optimized_queries`. Retriever uses them for vector + BM25. + +**Could cause graceful failure?** Indirectly. Weak or single query → fewer/better candidates → generator may not see the right chunk → answer unsupported → verifier fails. + +--- + +## 3. app/nodes/retriever.py + +**Purpose:** Hybrid (vector + BM25) retrieval, then cross-encoder rerank; fill `reranked_docs` with top K chunks. + +**Checks:** +- Retrieves for each of `optimized_queries` — **OK** +- `RERANK_TOP_K` was 7 — **increased to 10** so more context reaches the generator +- Cross-encoder is used — **OK** (`pairs = [[content, user_query]]`, then sort by score) +- Chroma path is `./chroma_db` resolved from cwd — **OK** if run from ReasonGraph/; otherwise ensure cwd or use absolute path + +**Potential bugs:** +- If the chunk with the answer (e.g. "bd and s & e activities are limited to...") is ranked 8th–10th by the cross-encoder, it was previously dropped; with top 10 it’s more likely to be included. 
+- `collection.query(query_embeddings=[emb.tolist()])` — Chroma expects list of lists; we pass one embedding as `[emb.tolist()]` — **OK** + +**Connection:** Writes `reranked_docs`. Generator reads them and builds the evidence block. + +**Could cause graceful failure?** Yes. If the right chunk never appears in top K, the generator cannot cite it and may say "evidence does not provide" or cite a wrong chunk → verifier fails. + +--- + +## 4. app/nodes/generator.py + +**Purpose:** Produce `draft_answer` and `citations` from `reranked_docs` via LLM; citations use `chunk_ids` (list). + +**Checks:** +- Receives `reranked_docs` from state — **OK** +- Builds evidence block with `chunk_id` per doc — **OK** +- Prompt asks for JSON with `answer` and `citations` (claim + chunk_ids) — **OK** +- Parser normalizes to `chunk_ids` list only (no `chunk_id` in output) — **OK**; verifier supports `chunk_ids` +- Prompt now stresses: when evidence contains the answer (even different wording), answer and cite; do not say "evidence does not provide"; always include citations — **OK** + +**Potential bugs:** +- If the model returns invalid JSON or omits `citations`, we return `draft_answer` with `citations=[]` → verifier then fails (empty citations + non-failure answer). +- Chunk IDs must match exactly the `chunk_id` in the evidence block (e.g. `2602.15019v2_c1`). Typos or extra spaces → verifier sees MISSING_CHUNK → fails. + +**Connection:** Writes `draft_answer`, `citations`. Verifier reads both and `reranked_docs` to build chunk_map. + +**Could cause graceful failure?** Yes. Generator saying "evidence does not provide" or citing wrong/missing chunk_id → verifier rejects. + +--- + +## 5. app/nodes/verifier.py + +**Purpose:** Check every cited claim against chunk content; set `verifier_passed` and `unsupported_claims`. 
+ +**Checks:** +- Receives `citations` (list of dicts with `claim` and `chunk_ids`) and `reranked_docs` — **OK** +- `_citation_chunk_ids(c)` supports both `chunk_id` (string) and `chunk_ids` (list) — **OK** +- Verification prompt describes multi-chunk synthesis and semantic entailment — **OK** +- User query is filtered out from `unsupported_claims` (so we don’t show the question as a claim) — **OK** +- System prompt says "fair" and "Do NOT require exact word-for-word" — **OK** +- User prompt line "Be fair but critical..." was relaxed to "Be FAIR: accept when chunks collectively support..." — **done** + +**Potential bugs:** +- If the verifier LLM is conservative, it may still return `verifier_passed: false` despite fair instructions. Temperature is 0.1 — **OK** +- When `invalid_chunk_ids` is non-empty (cited chunk not in reranked_docs), we force `passed = False` and add those claims to unsupported — **correct** + +**Connection:** Reads `draft_answer`, `citations`, `reranked_docs`; writes `verifier_passed`, `unsupported_claims`. Loop controller reads `verifier_passed`, `search_count`, `max_search`. + +**Could cause graceful failure?** Yes. Overly strict interpretation of "support" or multi-chunk synthesis causes valid answers to be rejected. + +--- + +## 6. app/nodes/loop_controller.py + +**Purpose:** After verifier, route to END, retry (increment_search → query_optimizer), or graceful_failure. + +**Checks:** +- `verifier_passed` → end — **OK** +- `search_count < max_search` → increment_search — **OK** +- Else → graceful_failure — **OK** +- `increment_search_node` returns `search_count + 1` — **OK**; LangGraph merges into state + +**Fix applied:** Use `state.get("search_count", 0)` and `state.get("max_search", 2)` so missing keys don’t cause KeyError. + +**Potential bugs:** None. Loop is not infinite: after `max_search` increments we hit graceful_failure. + +**Connection:** Conditional edges from verifier; increment_search → query_optimizer. 
+ +**Could cause graceful failure?** Only by design: when retries are exhausted we intentionally go to graceful_failure. + +--- + +## 7. app/graph/graph.py + +**Purpose:** Define nodes and edges for the LangGraph pipeline. + +**Checks:** +- Topology: START → query_optimizer → retriever → generator → verifier → conditional → end / increment_search / graceful_failure — **OK** +- increment_search → query_optimizer — **OK** +- graceful_failure → END — **OK** + +**Potential bugs:** None. + +**Could cause graceful failure?** No. Topology is correct. + +--- + +## 8. main.py + +**Purpose:** Invoke graph, print draft answer, citations, verifier warning, unsupported claims. + +**Checks:** +- Displays multi-chunk citations (chunk_ids list or chunk_id string) — **OK** +- Shows verifier_passed via "Warning: The answer may be unreliable..." when not passed — **OK** + +**Potential bugs:** When the run ends at graceful_failure, `result["draft_answer"]` is the overwritten generic message, so you never see the last generator answer. Citations and unsupported_claims are still from the last verifier run. So debugging "why did verifier fail?" requires either logging the last draft/citations before overwrite or not overwriting draft on graceful_failure (optional). + +**Could cause graceful failure?** No. Display only. + +--- + +## Root causes of "Available evidence does not sufficiently support a reliable answer" + +1. **Verifier fails** on every attempt (so after max_search retries we go to graceful_failure). +2. **Why verifier fails (pick one or more):** + - **Wrong or missing chunk in retrieval:** The chunk that says "bd and s & e activities are limited to largely manual and time-consuming tasks" isn’t in top K → generator can’t cite it or says "evidence does not provide". + - **Query optimizer:** Queries don’t expand BD/S&E → retrieval doesn’t surface that chunk well → same as above. 
+ - **Generator:** Answers with "evidence does not provide" or cites a chunk that doesn’t support the claim → verifier correctly fails. + - **Verifier too strict:** Even with the right chunk and a reasonable answer, verifier LLM returns `verifier_passed: false` (e.g. requires exact wording or doesn’t accept synthesis). + +## Fixes applied in code + +- **state.py:** Comments that initial_state must be complete; return plain dict (unchanged behavior). +- **query_optimizer.py:** Prompt instruction to expand abbreviations (BD, S&E, etc.) for recall. +- **retriever.py:** `RERANK_TOP_K` increased from 7 to 10. +- **generator.py:** Stronger prompt: must answer and cite when evidence contains the answer; always include citations. +- **verifier.py:** User prompt relaxed to "Be FAIR: accept when chunks collectively support...". +- **loop_controller.py:** Defensive `state.get("search_count", 0)` and `state.get("max_search", 2)`. + +## Format mismatch check + +- Generator outputs citations with **chunk_ids** (list) only (parser normalizes old `chunk_id` to list). +- Verifier **_citation_chunk_ids()** reads both **chunk_id** and **chunk_ids** and returns a list of chunk IDs. +- So there is **no format mismatch** between generator output and verifier input. + +## Suggested next steps + +1. Re-run ingest if you changed chunk size (so Chroma has 500-token chunks). +2. Run the same query and confirm query_optimizer returns at least one query with "business development" or "s & e". +3. Add temporary logging: in verifier_node, log `draft_answer`, `citations`, and verifier LLM response (or at least `passed`) so you can see why it failed. +4. Optionally: in the graceful_failure node, do *not* overwrite `draft_answer` so the final result still shows what the generator said (and you can see what was rejected). 
diff --git a/app/graph/graph.py b/app/graph/graph.py index d0a6760..82eeadc 100644 --- a/app/graph/graph.py +++ b/app/graph/graph.py @@ -7,7 +7,7 @@ from app.graph.state import GraphState from app.nodes.generator import generator_node -from app.nodes.loop_controller import loop_controller +from app.nodes.loop_controller import increment_search_node, loop_controller from app.nodes.query_optimizer import query_optimizer_node from app.nodes.retriever import retriever_node from app.nodes.verifier import verifier_node @@ -20,6 +20,7 @@ builder.add_node("retriever", retriever_node) builder.add_node("generator", generator_node) builder.add_node("verifier", verifier_node) +builder.add_node("increment_search", increment_search_node) builder.add_node( "graceful_failure", lambda state: { @@ -38,10 +39,11 @@ loop_controller, { "end": END, - "query_optimizer": "query_optimizer", + "increment_search": "increment_search", "graceful_failure": "graceful_failure", }, ) +builder.add_edge("increment_search", "query_optimizer") builder.add_edge("graceful_failure", END) graph = builder.compile() diff --git a/app/graph/state.py b/app/graph/state.py index 9956184..58d61a5 100644 --- a/app/graph/state.py +++ b/app/graph/state.py @@ -1,6 +1,7 @@ from typing import TypedDict, List, Optional class GraphState(TypedDict): + """State for ReasonGraph. initial_state() must set every key so loop_controller never sees missing search_count/max_search.""" user_query: str optimized_queries: List[str] retrieved_docs: List[dict] @@ -12,16 +13,18 @@ class GraphState(TypedDict): search_count: int max_search: int + def initial_state(user_query: str) -> GraphState: - return GraphState( - user_query=user_query, - optimized_queries=[], - retrieved_docs=[], - reranked_docs=[], - draft_answer=None, - citations=[], - verifier_passed=False, - unsupported_claims=[], - search_count=0, - max_search=3 - ) \ No newline at end of file + """Complete initial state. 
search_count and max_search are required for retry/graceful_failure routing.""" + return { + "user_query": user_query, + "optimized_queries": [], + "retrieved_docs": [], + "reranked_docs": [], + "draft_answer": None, + "citations": [], + "verifier_passed": False, + "unsupported_claims": [], + "search_count": 0, + "max_search": 2, + } \ No newline at end of file diff --git a/app/nodes/generator.py b/app/nodes/generator.py index 2fa6554..9effca4 100644 --- a/app/nodes/generator.py +++ b/app/nodes/generator.py @@ -1,17 +1,18 @@ """ -LangGraph generator node for ReasonGraph: draft answer with citations via Gemini. +LangGraph generator node for ReasonGraph: draft answer with citations via Groq. +ENHANCED: Supports multi-chunk citations for claims spanning multiple evidence chunks. """ import json import os import re -from google import genai -from google.genai import types +from groq import Groq from app.graph.state import GraphState -_client = genai.Client(api_key=os.environ["GEMINI_API_KEY"]) +_client = Groq(api_key=os.environ.get("GROQ_API_KEY", "")) +_MODEL = "llama-3.3-70b-versatile" def _format_evidence(docs: list[dict]) -> str: @@ -25,7 +26,18 @@ def _format_evidence(docs: list[dict]) -> str: def _build_prompt(user_query: str, evidence_block: str) -> str: - return f"""You are an answer generator for an enterprise RAG system. Use ONLY the evidence below to answer the user query. Do not use prior knowledge. Every claim in your answer must be supported by the evidence and cited inline using the chunk_id in square brackets, e.g. [doc_p1_c0]. + return f"""You are an answer generator for an enterprise RAG system. Use ONLY the evidence below to answer the user query. Do not use prior knowledge. Every claim in your answer must be supported by the evidence and cited inline using chunk_id(s) in square brackets. 
+ +CITATION RULES: +- Single chunk claim: [chunk_id] when one chunk fully supports the claim +- Multi-chunk claim: [chunk_id1, chunk_id2] when a claim requires combining chunks + Example: If chunk_id "doc_p1" says "BD activities are manual" and chunk_id "doc_p2" says "S&E activities are time-consuming", and you want to say "BD and S&E are limited to manual and time-consuming tasks", cite [doc_p1, doc_p2] + +IMPORTANT: +- Treat equivalent phrasings as the same: "bd"/"BD", "s & e"/"s&e"/"S&E", "business development" +- Match abbreviations and different spacing in evidence to user's terms +- If evidence clearly states the answer (even with different spelling or phrasing), you MUST provide that answer and cite the chunk(s). Do NOT respond that "evidence does not provide" when the evidence does contain the answer. +- Always include a "citations" array with at least one entry per factual claim, using the exact chunk_id(s) from the EVIDENCE block. EVIDENCE: {evidence_block} @@ -35,16 +47,21 @@ def _build_prompt(user_query: str, evidence_block: str) -> str: Respond with ONLY a single JSON object (no markdown, no preamble, no explanation): {{ - "answer": "full answer text with inline citations like [chunk_id]", + "answer": "full answer text with inline citations like [chunk_id] or [chunk_id1, chunk_id2]", "citations": [ - {{ "claim": "short claim text", "chunk_id": "chunk_id string" }} + {{ "claim": "short claim text", "chunk_ids": ["chunk_id"] }}, + {{ "claim": "another claim spanning chunks", "chunk_ids": ["chunk_id1", "chunk_id2"] }} ] }} """ def _parse_response(text: str) -> tuple[str, list[dict]]: - """Parse model response into (draft_answer, citations). On failure return ('Generation failed.', []).""" + """ + Parse model response into (draft_answer, citations). + Handles both old format (chunk_id: string) and new format (chunk_ids: list). 
+ On failure return ('Generation failed.', []) + """ if not text or not text.strip(): return "Generation failed.", [] raw = text.strip() @@ -64,23 +81,51 @@ def _parse_response(text: str) -> tuple[str, list[dict]]: citations_raw = parsed.get("citations") if not isinstance(citations_raw, list): return draft_answer, [] + citations = [] for c in citations_raw: if isinstance(c, dict): claim = c.get("claim") + if claim is None: + continue + + # Support both old (chunk_id: string) and new (chunk_ids: list) formats chunk_id = c.get("chunk_id") - if claim is not None and chunk_id is not None: + chunk_ids = c.get("chunk_ids") + + # Normalize to chunk_ids (list) + if chunk_id is not None: + # Old format: convert string to list + chunk_ids = [str(chunk_id).strip()] + elif chunk_ids is not None: + # New format: ensure list of strings + if isinstance(chunk_ids, list): + chunk_ids = [str(cid).strip() for cid in chunk_ids if cid is not None] + else: + chunk_ids = [str(chunk_ids).strip()] + else: + # No chunk reference found + continue + + if chunk_ids: citations.append({ "claim": str(claim).strip(), - "chunk_id": str(chunk_id).strip(), + "chunk_ids": chunk_ids, }) + return draft_answer, citations except (json.JSONDecodeError, ValueError, TypeError, AttributeError): return "Generation failed.", [] def generator_node(state: GraphState) -> GraphState: - """Generate draft answer and citations from reranked_docs using Gemini. Populates draft_answer and citations.""" + """ + Generate draft answer and citations from reranked_docs using Groq. + Populates draft_answer and citations. + + ENHANCED: Supports multi-chunk citations (chunk_ids as list). + Maintains backward compatibility with single chunk citations. 
+ """ user_query = state.get("user_query") or "" reranked_docs = state.get("reranked_docs") or [] @@ -94,12 +139,12 @@ def generator_node(state: GraphState) -> GraphState: prompt = _build_prompt(user_query, evidence_block) try: - response = _client.models.generate_content( - model="gemini-2.5-flash", - contents=prompt, + response = _client.chat.completions.create( + model=_MODEL, + messages=[{"role": "user", "content": prompt}], ) - text = (response.text or "").strip() + text = (response.choices[0].message.content or "").strip() draft_answer, citations = _parse_response(text) return {"draft_answer": draft_answer, "citations": citations} except Exception: - return {"draft_answer": "Generation failed.", "citations": []} + return {"draft_answer": "Generation failed.", "citations": []} \ No newline at end of file diff --git a/app/nodes/loop_controller.py b/app/nodes/loop_controller.py index fa67b0f..3b8175b 100644 --- a/app/nodes/loop_controller.py +++ b/app/nodes/loop_controller.py @@ -2,11 +2,18 @@ def loop_controller(state: GraphState) -> str: - if state["verifier_passed"]: + """Router only: do not mutate state here; LangGraph does not persist router mutations.""" + if state.get("verifier_passed", False): return "end" - if state["search_count"] < state["max_search"]: - state["search_count"] += 1 - return "query_optimizer" + search_count = state.get("search_count", 0) + max_search = state.get("max_search", 2) + if search_count < max_search: + return "increment_search" return "graceful_failure" + + +def increment_search_node(state: GraphState) -> dict: + """Increment search_count so the retry limit is enforced. 
Must be a node so the update persists.""" + return {"search_count": state.get("search_count", 0) + 1} diff --git a/app/nodes/query_optimizer.py b/app/nodes/query_optimizer.py index c3370a3..64b6aad 100644 --- a/app/nodes/query_optimizer.py +++ b/app/nodes/query_optimizer.py @@ -1,17 +1,17 @@ """ -Query optimizer node for ReasonGraph: produces 3 optimized search queries via Gemini. +Query optimizer node for ReasonGraph: produces 3 optimized search queries via Groq. """ import json import os import re -from google import genai -from google.genai import types +from groq import Groq from app.graph.state import GraphState -_client = genai.Client(api_key=os.environ["GEMINI_API_KEY"]) +_client = Groq(api_key=os.environ.get("GROQ_API_KEY", "")) +_MODEL = "llama-3.3-70b-versatile" _QUERY_OPTIMIZER_PROMPT = """You are a search query optimizer for an enterprise RAG system. @@ -20,6 +20,7 @@ 2. Preserve or extract any version numbers, product names, or filters mentioned in the query. 3. Maximize recall and reduce noise: use clear keywords and avoid vague terms. 4. Each of the 3 queries should be a distinct variant (e.g., different phrasings or focus). +5. For abbreviations (e.g. BD, S&E), include expanded or alternate forms in at least one query to improve recall (e.g. "business development", "s & e", "sales and engineering") so document phrasing variations are matched. Output ONLY a JSON array of exactly 3 strings. No other text, no markdown, no explanation. 
@@ -36,11 +37,11 @@ def query_optimizer_node(state: GraphState) -> GraphState: prompt = _QUERY_OPTIMIZER_PROMPT.format(user_query=user_query.strip()) try: - response = _client.models.generate_content( - model="gemini-2.5-flash", - contents=prompt, + response = _client.chat.completions.create( + model=_MODEL, + messages=[{"role": "user", "content": prompt}], ) - text = (response.text or "").strip() + text = (response.choices[0].message.content or "").strip() # Allow optional markdown code fence wrapper if text.startswith("```"): text = re.sub(r"^```(?:json)?\s*", "", text) @@ -50,7 +51,7 @@ def query_optimizer_node(state: GraphState) -> GraphState: queries = [str(q).strip() for q in parsed[:3] if q] if len(queries) < 3: queries.extend([user_query] * (3 - len(queries))) - return {"optimized_queries": [user_query, user_query, user_query]} + return {"optimized_queries": queries[:3]} except (json.JSONDecodeError, ValueError, KeyError, AttributeError): pass diff --git a/app/nodes/retriever.py b/app/nodes/retriever.py index f5e37eb..2a85e48 100644 --- a/app/nodes/retriever.py +++ b/app/nodes/retriever.py @@ -16,7 +16,7 @@ COLLECTION_NAME = "reasongraph_docs" VECTOR_TOP_K = 10 BM25_TOP_K = 10 -RERANK_TOP_K = 5 +RERANK_TOP_K = 10 _embedding_model: SentenceTransformer | None = None _cross_encoder: CrossEncoder | None = None @@ -167,7 +167,7 @@ def _run_retrieval(state: GraphState) -> list[dict]: candidates = list(merged.values()) - # 4. RERANKING: cross-encoder, sort by score desc, top 5 + # 4. RERANKING: cross-encoder, sort by score desc, take top RERANK_TOP_K cross = _get_cross_encoder() pairs = [[c["content"], user_query] for c in candidates] ce_scores = cross.predict(pairs) @@ -183,7 +183,7 @@ def _run_retrieval(state: GraphState) -> list[dict]: def retriever_node(state: GraphState) -> GraphState: - """Populate state['reranked_docs'] with top 5 chunks from hybrid retrieval. 
On failure or empty pool, set to [].""" + """Populate state['reranked_docs'] with top RERANK_TOP_K chunks from hybrid retrieval. On failure or empty pool, set to [].""" try: reranked_docs = _run_retrieval(state) return {"reranked_docs": reranked_docs} diff --git a/app/nodes/verifier.py b/app/nodes/verifier.py index 4e571a4..83f6c1f 100644 --- a/app/nodes/verifier.py +++ b/app/nodes/verifier.py @@ -1,29 +1,55 @@ """ LangGraph verifier node for ReasonGraph: trust layer that audits generator output by cross-referencing citations against chunk content. + +ENHANCED: Supports multi-chunk claims and semantic entailment. """ import json import os import re -from google import genai -from google.genai import types +from groq import Groq from app.graph.state import GraphState -_VERIFIER_SYSTEM_PROMPT = """You are a strict evidence auditor for an enterprise RAG system. -Your only job is to verify whether claims are directly supported by the provided evidence chunks. -You are skeptical by default. You do not infer, assume, or give benefit of the doubt. -A claim is only supported if the cited chunk explicitly contains the information. +_VERIFIER_SYSTEM_PROMPT = """You are a fair evidence auditor for an enterprise RAG system. +Your only job is to verify whether claims are supported by the provided evidence chunks. 
+ +YOU SHOULD ACCEPT: +- Claims explicitly stated in a single chunk +- Claims that logically follow from combining multiple chunks +- Claims that synthesize information across multiple chunks (e.g., 'X is Y and Z' where one chunk mentions X and Y, another mentions Z) +- Claims semantically entailed by chunk content (even if not word-for-word identical) +- Claims where the cited chunks establish the key facts, even if exact wording differs + +YOU SHOULD REJECT: +- Claims where cited chunks don't exist (MISSING_CHUNK status) +- Claims contradicted by the chunks +- Claims requiring knowledge outside the provided chunks +- Claims with no citations when factual + +APPROACH: Be FAIR not STRICT. If chunks collectively establish a claim through reasonable synthesis, accept it. +Do NOT require exact word-for-word matches. Do NOT reject because wording differs from the chunks. You are not the answer generator. You are its auditor.""" -_client = genai.Client(api_key=os.environ["GEMINI_API_KEY"]) +_client = Groq(api_key=os.environ.get("GROQ_API_KEY", "")) +_MODEL = "llama-3.3-70b-versatile" # Sentinel for generator failure messages so we don't require citations for them _FAILURE_PHRASES = ("generation failed", "no relevant evidence found") +def _normalize_chunk_id(raw: str) -> str: + """Strip whitespace and optional square brackets so citation chunk_id matches reranked_docs keys.""" + if not raw: + return "" + s = str(raw).strip() + if s.startswith("[") and s.endswith("]"): + s = s[1:-1].strip() + return s + + def _chunk_map(reranked_docs: list[dict]) -> dict[str, str]: """Build chunk_id -> content from reranked_docs.""" out = {} @@ -42,6 +68,19 @@ def _is_failure_answer(draft_answer: str) -> bool: return any(phrase in lower for phrase in _FAILURE_PHRASES) +def _citation_chunk_ids(c: dict) -> list[str]: + """Return list of chunk IDs for a citation (supports chunk_id string or chunk_ids list).""" + chunk_id = c.get("chunk_id") + chunk_ids = c.get("chunk_ids") + if chunk_ids is 
not None: + if isinstance(chunk_ids, list): + return [_normalize_chunk_id(str(cid)) for cid in chunk_ids if cid is not None] + return [_normalize_chunk_id(str(chunk_ids))] + if chunk_id is not None: + return [_normalize_chunk_id(str(chunk_id))] + return [] + + def _build_verification_prompt( user_query: str, draft_answer: str, @@ -53,16 +92,16 @@ def _build_verification_prompt( cited_claims = [] for i, c in enumerate(citations or [], start=1): claim = (c.get("claim") or "").strip() - chunk_id = (c.get("chunk_id") or "").strip() - content = chunk_map.get(chunk_id, "") - status = "MISSING_CHUNK" if chunk_id in invalid_chunk_ids else "OK" - cited_claims.append( - f"[{i}] claim: \"{claim}\" | chunk_id: {chunk_id} | status: {status}\n" - f" chunk content: {content if content else '(not in evidence)'}" - ) + cids = _citation_chunk_ids(c) + parts = [f"[{i}] claim: \"{claim}\" | chunk_ids: {cids}"] + for cid in cids: + status = "MISSING_CHUNK" if cid in invalid_chunk_ids else "OK" + content = chunk_map.get(cid, "") + parts.append(f" {cid} | status: {status}\n content: {content if content else '(not in evidence)'}") + cited_claims.append("\n".join(parts)) citations_block = "\n\n".join(cited_claims) if cited_claims else "(none)" - return f"""You are a verification layer for an enterprise RAG system. Your job is to decide whether the draft answer is fully supported by the evidence. + return f"""You are a verification layer for an enterprise RAG system. Your job is to decide whether the draft answer is supported by the evidence. USER QUERY: {user_query} @@ -73,17 +112,24 @@ def _build_verification_prompt( CITATIONS AND REFERENCED CHUNK CONTENT: {citations_block} -RULES: -1. If any citation has status MISSING_CHUNK, that claim is unsupported. -2. For citations with chunk content: the chunk content must semantically support the claim. If it does not, add the claim to unsupported_claims. -3. 
If the draft answer contains factual claims that are not covered by any citation above, add those claim texts to unsupported_claims. -4. verifier_passed must be true ONLY when ALL claims are supported and ALL chunk_ids exist in the evidence. Otherwise false. -5. confidence is a float between 0.0 and 1.0 reflecting how strongly the evidence supports the answer overall. +VERIFICATION RULES: +1. MISSING_CHUNK = automatic failure for that claim. Add to unsupported_claims. +2. For OK chunks: Does the chunk content support the claim? + - Explicit match = YES + - Semantic entailment = YES (e.g., if chunk says "manual tasks take hours" and claim is "limited to time-consuming work") + - Logical combination of chunks = YES (e.g., chunk 1 says "X is limited" and chunk 2 says "to manual tasks") + - Vague or contradictory = NO +3. Check for uncited factual claims in the answer that aren't covered by any citation = add those to unsupported_claims. +4. unsupported_claims must list only specific ASSERTIONS from the DRAFT ANSWER that lack evidence—NOT the user query. The user query is the question; do not put it in unsupported_claims. +5. verifier_passed = TRUE only if ALL claims are supported and ALL chunk_ids exist. Otherwise FALSE. +6. confidence = 0.0-1.0 reflecting overall evidence strength for the answer. + +Be FAIR: accept when chunks collectively support the claim (single or multi-chunk synthesis). Reject only when chunks clearly do not support or contradict the claim. Respond with ONLY a single JSON object (no markdown, no preamble, no explanation): {{ "verifier_passed": true or false, - "unsupported_claims": ["claim text", ...], + "unsupported_claims": ["specific claim from draft answer that lacks support", ...], "confidence": 0.0 }} """ @@ -115,6 +161,8 @@ def _parse_verifier_response(text: str) -> tuple[bool, list[str]]: def verifier_node(state: GraphState) -> GraphState: """Audit generator output: cross-reference every citation against chunk content. 
Sets verifier_passed and unsupported_claims on state. + + ENHANCED: Supports multi-chunk reasoning and semantic entailment. """ user_query = state.get("user_query") or "" draft_answer = state.get("draft_answer") or "" @@ -132,16 +180,16 @@ def verifier_node(state: GraphState) -> GraphState: chunk_map = _chunk_map(reranked_docs) invalid_chunk_ids = [] for c in citations: - chunk_id = (c.get("chunk_id") or "").strip() - if chunk_id and chunk_id not in chunk_map: - invalid_chunk_ids.append(chunk_id) + for cid in _citation_chunk_ids(c): + if cid and cid not in chunk_map: + invalid_chunk_ids.append(cid) # If any chunk_id is missing, we can short-circuit: verifier fails, collect unsupported claims if invalid_chunk_ids: unsupported_from_bad_ids = [ (c.get("claim") or "").strip() for c in citations - if ((c.get("chunk_id") or "").strip() in invalid_chunk_ids) + if any(cid in invalid_chunk_ids for cid in _citation_chunk_ids(c)) ] unsupported_from_bad_ids = [u for u in unsupported_from_bad_ids if u] # Still call model to catch semantic misalignment and uncited claims; merge unsupported after @@ -157,15 +205,15 @@ def verifier_node(state: GraphState) -> GraphState: ) try: - response = _client.models.generate_content( - model="gemini-2.5-flash", - contents=prompt, - config=types.GenerateContentConfig( - system_instruction=_VERIFIER_SYSTEM_PROMPT, - temperature=0.1, - ), + response = _client.chat.completions.create( + model=_MODEL, + messages=[ + {"role": "system", "content": _VERIFIER_SYSTEM_PROMPT}, + {"role": "user", "content": prompt}, + ], + temperature=0.1, ) - text = (response.text or "").strip() + text = (response.choices[0].message.content or "").strip() passed, model_unsupported = _parse_verifier_response(text) except Exception: passed = False @@ -176,4 +224,24 @@ def verifier_node(state: GraphState) -> GraphState: if not pass_through: passed = False - return {"verifier_passed": passed, "unsupported_claims": all_unsupported} + # Do not report the user query itself as 
an unsupported claim (verifier sometimes echoes the question) + def _is_same_as_query(claim: str) -> bool: + c = (claim or "").strip().lower() + q = (user_query or "").strip().lower() + if not q or not c: + return False + # Exact match, or query with trailing ? removed + if c == q or c == q.rstrip("?").strip(): + return True + # Claim is only the query with "what is" / "what are" removed (e.g. "BD and S&E activities limited to") + c_words = set(c.split()) + q_words = set(q.rstrip("?").split()) + if c_words <= q_words or q_words <= c_words: + return len(c_words) < 20 and len(q_words) < 20 # avoid stripping real short claims + return False + + user_q = (user_query or "").strip().lower() + if user_q: + all_unsupported = [u for u in all_unsupported if not _is_same_as_query(u)] + + return {"verifier_passed": passed, "unsupported_claims": all_unsupported} \ No newline at end of file diff --git a/docs/arxiv_reasoning.pdf b/app/retrieval/docs/2602.15019v2.pdf similarity index 86% rename from docs/arxiv_reasoning.pdf rename to app/retrieval/docs/2602.15019v2.pdf index 7436e5e..6d5e85b 100644 Binary files a/docs/arxiv_reasoning.pdf and b/app/retrieval/docs/2602.15019v2.pdf differ diff --git a/app/retrieval/hybrid_search.py b/app/retrieval/hybrid_search.py deleted file mode 100644 index e69de29..0000000 diff --git a/app/retrieval/ingest.py b/app/retrieval/ingest.py index 9b53609..0297787 100644 --- a/app/retrieval/ingest.py +++ b/app/retrieval/ingest.py @@ -6,12 +6,35 @@ """ import re +import tempfile from pathlib import Path import chromadb -from pypdf import PdfReader +import fitz from sentence_transformers import SentenceTransformer + +def extract_pdf_text(file_path: str) -> str: + """Extract text from a PDF. 
Uses a temp copy to avoid path locks and PyMuPDF stream issues.""" + path = Path(file_path) + if not path.is_file(): + raise FileNotFoundError(f"PDF not found: {path}") + + # Validate PDF header before processing + data = path.read_bytes() + if not data.startswith(b'%PDF'): + raise ValueError(f"Invalid PDF header: {path} does not start with '%PDF'") + + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmp: + tmp.write(data) + tmp.flush() + doc = fitz.open(tmp.name) + text = "" + for page in doc: + text += page.get_text() + doc.close() + return text + CHROMA_PATH = "./chroma_db" COLLECTION_NAME = "reasongraph_docs" CHUNK_SIZE_TOKENS = 500 @@ -129,6 +152,8 @@ def _ingest_directory(docs_dir: str | Path) -> int: ) total_chunks = 0 + failed_files = [] + for pdf_path in pdf_files: try: relative_path = pdf_path.relative_to(docs_path) @@ -140,28 +165,32 @@ if file_slug.lower().endswith(".pdf"): file_slug = file_slug[:-4] - reader = PdfReader(pdf_path) + raw_text = extract_pdf_text(str(pdf_path)) + if not raw_text or not raw_text.strip(): + print(f" {source_url}: no text extracted, skipping") + failed_files.append((source_url, "No text extracted")) + continue + + chunk_texts = _chunk_text(raw_text.strip(), tokenizer) all_chunk_texts: list[str] = [] all_metadatas: list[dict] = [] all_chunk_ids: list[str] = [] - for page_num, page in enumerate(reader.pages, start=1): - raw_text = page.extract_text() or "" - chunk_texts = _chunk_text(raw_text, tokenizer) - for ci, chunk_text in enumerate(chunk_texts): - chunk_id = f"{file_slug}_p{page_num}_c{ci}" - all_chunk_ids.append(chunk_id) - all_chunk_texts.append(chunk_text) - all_metadatas.append({ - "chunk_id": chunk_id, - "source_url": source_url, - "page_number": page_num, - "version": version, - "char_count": len(chunk_text), - }) + for ci, chunk_text in enumerate(chunk_texts): + chunk_id = f"{file_slug}_c{ci}" + all_chunk_ids.append(chunk_id) + 
all_chunk_texts.append(chunk_text) + all_metadatas.append({ + "chunk_id": chunk_id, + "source_url": source_url, + "page_number": 1, + "version": version, + "char_count": len(chunk_text), + }) if not all_chunk_texts: print(f" {source_url}: no text extracted, skipping") + failed_files.append((source_url, "No chunks generated")) continue embeddings = model.encode( @@ -177,13 +206,29 @@ ) total_chunks += len(all_chunk_ids) print(f" {source_url}: stored {len(all_chunk_ids)} chunk(s)") + except (ValueError, FileNotFoundError) as e: + # Invalid PDF or file not found - log and continue + print(f" {pdf_path.name}: invalid file - {e}") + failed_files.append((pdf_path.name, str(e))) except Exception as e: - print(f" {pdf_path}: error - {e}") - + # Other unexpected errors + print(f" {pdf_path.name}: error - {e}") + failed_files.append((pdf_path.name, str(e))) + + if failed_files: + print(f"\n{len(failed_files)} file(s) failed:") + for filename, reason in failed_files: + print(f" - {filename}: {reason}") + return total_chunks if __name__ == "__main__": - docs_dir = "./docs" + # Resolve docs relative to project root (ReasonGraph/) so it works from any cwd + _script_dir = Path(__file__).resolve().parent # app/retrieval + _project_root = _script_dir.parent.parent # ReasonGraph + docs_dir = _project_root / "docs" + if not docs_dir.is_dir(): + docs_dir = Path("./docs").resolve() total = _ingest_directory(docs_dir) - print(f"Total chunks stored: {total}") + print(f"\nTotal chunks stored: {total}") \ No newline at end of file diff --git a/chroma_db/7bfa883a-5699-46b1-831e-8e8b79a32563/data_level0.bin b/chroma_db/7bfa883a-5699-46b1-831e-8e8b79a32563/data_level0.bin deleted file mode 100644 index 0ec5e2f..0000000 Binary files a/chroma_db/7bfa883a-5699-46b1-831e-8e8b79a32563/data_level0.bin and /dev/null differ diff --git a/chroma_db/7bfa883a-5699-46b1-831e-8e8b79a32563/header.bin 
b/chroma_db/7bfa883a-5699-46b1-831e-8e8b79a32563/header.bin deleted file mode 100644 index bb54792..0000000 Binary files a/chroma_db/7bfa883a-5699-46b1-831e-8e8b79a32563/header.bin and /dev/null differ diff --git a/chroma_db/7bfa883a-5699-46b1-831e-8e8b79a32563/length.bin b/chroma_db/7bfa883a-5699-46b1-831e-8e8b79a32563/length.bin deleted file mode 100644 index cb3e162..0000000 Binary files a/chroma_db/7bfa883a-5699-46b1-831e-8e8b79a32563/length.bin and /dev/null differ diff --git a/chroma_db/7bfa883a-5699-46b1-831e-8e8b79a32563/link_lists.bin b/chroma_db/7bfa883a-5699-46b1-831e-8e8b79a32563/link_lists.bin deleted file mode 100644 index e69de29..0000000 diff --git a/chroma_db/chroma.sqlite3 b/chroma_db/chroma.sqlite3 deleted file mode 100644 index 6aba642..0000000 Binary files a/chroma_db/chroma.sqlite3 and /dev/null differ diff --git a/main.py b/main.py index 1a029a9..5982967 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,6 @@ """ Entry point for ReasonGraph. +ENHANCED: Supports multi-chunk citations display. 
""" import sys @@ -18,13 +19,31 @@ def main() -> None: print(draft) print() - # Citations: [chunk_id] → claim + # Citations: [chunk_id] or [chunk_id1, chunk_id2] → claim citations = result.get("citations") or [] if citations: for c in citations: - chunk_id = c.get("chunk_id") or "" claim = c.get("claim") or "" - print(f"[{chunk_id}] → {claim}") + + # Support both old format (chunk_id: string) and new format (chunk_ids: list) + chunk_id = c.get("chunk_id") + chunk_ids = c.get("chunk_ids") + + # Normalize to display string + if chunk_ids is not None: + # New format: chunk_ids is a list + if isinstance(chunk_ids, list): + chunk_str = ", ".join(str(cid) for cid in chunk_ids) + else: + chunk_str = str(chunk_ids) + elif chunk_id is not None: + # Old format: chunk_id is a string + chunk_str = str(chunk_id) + else: + chunk_str = "unknown" + + # Display citation + print(f"[{chunk_str}] → {claim}") print() # Verifier warning @@ -42,3 +61,4 @@ def main() -> None: if __name__ == "__main__": main() + diff --git a/rag_simple_test.py b/rag_simple_test.py new file mode 100644 index 0000000..ac4ab22 --- /dev/null +++ b/rag_simple_test.py @@ -0,0 +1,64 @@ +""" +Simple RAG test: retrieve + generate answer only. No citations, no verifier. +Run from ReasonGraph/: python rag_simple_test.py "your question" +Requires: GROQ_API_KEY set; chroma_db populated (run ingest first). +""" +import os +import sys + +# Avoid tokenizers fork warning when using sentence-transformers +os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") + +from groq import Groq + +# Reuse retriever logic +from app.graph.state import GraphState +from app.nodes.retriever import _run_retrieval + +_client = Groq(api_key=os.environ.get("GROQ_API_KEY", "")) +_MODEL = "llama-3.3-70b-versatile" + + +def main() -> None: + query = sys.argv[1] if len(sys.argv) > 1 else "What is this document about?" 
+ state: GraphState = { + "user_query": query, + "optimized_queries": [query], # no query expansion for this test + "retrieved_docs": [], + "reranked_docs": [], + "draft_answer": None, + "citations": [], + "verifier_passed": False, + "unsupported_claims": [], + "search_count": 0, + "max_search": 2, + } + # Retrieve + docs = _run_retrieval(state) + if not docs: + print("No chunks retrieved. Check chroma_db and collection.") + return + context = "\n\n".join((d.get("content") or "").strip() for d in docs) + # Generate (no citations) + prompt = f"""Use ONLY the context below to answer the question. Do not use prior knowledge. Reply with a short, direct answer. No citations or references. + +CONTEXT: +{context} + +QUESTION: +{query} + +ANSWER:""" + try: + r = _client.chat.completions.create( + model=_MODEL, + messages=[{"role": "user", "content": prompt}], + ) + answer = (r.choices[0].message.content or "").strip() + print(answer) + except Exception as e: + print(f"Error: {e}") + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index b256b97..ac42a47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,11 +3,13 @@ # Graph orchestration langgraph -# LLM (Gemini) – query optimizer, generator, verifier -google-generativeai +# LLM – query optimizer, generator, verifier (Groq; was Gemini) +groq -# Retrieval: vector store, embeddings, BM25, PDF +# Retrieval: vector store, embeddings, BM25, PDF (pymupdf primary, pypdf fallback) chromadb +pymupdf pypdf rank_bm25 sentence-transformers +