From f20848ccb6f7dabc3c743c9ff894580956fec311 Mon Sep 17 00:00:00 2001
From: Brandon Li <brandonw.li1207@gmail.com>
Date: Thu, 7 Aug 2025 04:27:20 -0400
Subject: [PATCH 1/8] Integrate git grep alongside RAG search, plus several
 new RAG features

---
 .github/workflows/pr-summary-agent.yml |  14 +-
 PRChangesTest.patch                    | 348 +++++++++++++++++++++++++
 pr_agent/PR_agent.py                   |  59 ++---
 pr_agent/code_rag_agent.py             |   2 +-
 pr_agent/code_rag_agent.txt            |  88 -------
 pr_agent/git_grep_agent.py             |  52 +---
 pr_agent/rag_sub_agent.py              |   0
 src/agentic/tools/rag_tool.py          |  41 ++-
 src/agentic/utils/rag_helper.py        |  23 +-
 9 files changed, 452 insertions(+), 175 deletions(-)
 create mode 100644 PRChangesTest.patch
 delete mode 100644 pr_agent/code_rag_agent.txt
 delete mode 100644 pr_agent/rag_sub_agent.py

diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml
index 1fe312bf..a1f85441 100644
--- a/.github/workflows/pr-summary-agent.yml
+++ b/.github/workflows/pr-summary-agent.yml
@@ -18,7 +18,13 @@ jobs:
 
       - name: Install uv
         uses: astral-sh/setup-uv@v6
-        
+
+      - name: Download a single artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: weaviate  # NOTE(review): v4 artifacts are scoped to one workflow run; restoring from an earlier run needs run-id + github-token (or actions/cache) - confirm
+          path: ~/.cache/weaviate  # tilde is expanded by the action; HOME is not an Actions expression context
+
       - name: Run agent
         run: |
           uv venv --python 3.12
@@ -33,3 +39,9 @@ jobs:
           GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }}
           REPO_OWNER: supercog-ai
           REPO_NAME: PR_code_review-agent
+      
+      - name: Update weaviate cache
+        uses: actions/upload-artifact@v4
+        with:
+          name: weaviate
+          path: ~/.cache/weaviate  # must match the path used by the download step above in this workflow
diff --git a/PRChangesTest.patch b/PRChangesTest.patch
new file mode 100644
index 00000000..80fe4bfc
--- /dev/null
+++ b/PRChangesTest.patch
@@ -0,0 +1,348 @@
+diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml
+index 88e50da..1fe312b 100644
+--- a/.github/workflows/pr-summary-agent.yml
++++ b/.github/workflows/pr-summary-agent.yml
+@@ -25,10 +25,8 @@ jobs:
+           uv pip install -e "../PR_code_review-agent[all,dev]" --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-first-match
+           git diff --merge-base HEAD^1 HEAD > PRChanges.patch
+           cat PRChanges.patch
+-          uv run pr_agent/test_files/mock_pr_agent.py
+-          
+-#uv run pr_agent/PR_agent.py  
+-
++          uv run pr_agent/PR_agent.py
++    
+         env: 
+           OPENAI_API_KEY: ${{ secrets.PRAgentOpenAIKey }}
+           PR_ID: ${{ github.event.pull_request.number }}
+diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py
+index 46b8f1f..c99d863 100644
+--- a/pr_agent/PR_agent.py
++++ b/pr_agent/PR_agent.py
+@@ -29,14 +29,6 @@ class SearchResult(BaseModel):
+     similarity_score: float = Field(
+         desciption="Similarity score returned from vector search."
+     )
+-    is_relevant: bool = Field(
+-        default = True,
+-        description="Boolean describing if the search result is relevant to the query."
+-    )
+-    relevance_reason: str = Field(
+-        default = "",
+-        description="Boolean describing if the search result is relevant to the query."
+-    )
+     included_defs: List[str] = Field(
+         default_factory=list,
+         desciption="Similarity score returned from vector search."
+@@ -49,7 +41,6 @@ class Searches(BaseModel):
+ 
+ class RelevanceResult(BaseModel):
+     relevant: bool 
+-    reason: str
+ 
+ class PRReviewAgent(Agent):
+ 
+@@ -81,7 +72,7 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a
+ 
+         self.relevanceAgent = Agent(
+             name="Code Relevange Agent",
+-            instructions="""You are an expert in determining if a snippet of code or documentation is needed to determine the purpose of a code change from the patch file. Your response must include a 'relevant' field boolean and a 'reason' field with a brief explanation.""",
++            instructions="""You are an expert in determining if a snippet of code or documentation is directly relevant to a query. Your response must include a 'relevant' field boolean.""",
+             model=GPT_4O_MINI,
+             result_model=RelevanceResult,
+         )
+@@ -142,10 +133,10 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a
+             }
+         )
+ 
+-        print("quer"+str(queries))
++        print("queries: "+str(queries))
+ 
+-        all_results = []
+-    
++        # RAG queries
++        all_results = {}
+         for query in queries.searches[:10]:
+             searchResponse = yield from self.code_rag_agent.final_result(
+                 f"Search codebase",
+@@ -156,35 +147,34 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a
+             )
+             
+             # Process each result
+-            for result in searchResponse.sections:
+-                all_results.append(SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs))
++            for key, result in searchResponse.sections.items():
++                if not key in all_results:
++                    all_results[key] = SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs)
+ 
+-        print("fil"+str(all_results))
++        print("all: "+str(all_results))
+ 
+         # Filter search results using LLM-based relevance checking
+         filtered_results = []
+-        
+-        for result in all_results: 
+-            if result.similarity_score < 0.5:
+-                continue
+-                
+-            relevance_check = yield from self.relevanceAgent.final_result(
+-                f"<Patch File>\n{request_context.get("patch_content")}\n</Patch File>\n\n<Content>{result.content}</Content><Query>{result.query}</Query>"
+-            )
+-
+-            print(relevance_check)
+-            
+-            result.is_relevant = relevance_check.relevant
+-            result.relevance_reason = relevance_check.reason
++        for result in all_results.values(): 
+             
+-            if result.is_relevant:
+-                filtered_results.append(result)
++            try:
++                relevance_check = yield from self.relevanceAgent.final_result(
++                    f"<Patch File>\n{request_context.get("patch_content")}\n</Patch File>\n\n<Content>{result.content}</Content><Query>{result.query}</Query>"
++                )
++                
++                if relevance_check.relevant:
++                    filtered_results.append(result)
++            except Exception as e:
++                # LLM error
++                print(e)
+ 
+-        print(str(filtered_results))
++        print("filtered: ",str(filtered_results))
+ 
+         # Prepare for summary
+         formatted_str = self.prepare_summary(request_context.get("patch_content"),filtered_results)
+ 
++        print(formatted_str)
++
+         summary = yield from self.summaryAgent.final_result(
+             formatted_str
+         )
+diff --git a/pr_agent/code_rag_agent.py b/pr_agent/code_rag_agent.py
+index b7f36df..32d0a34 100644
+--- a/pr_agent/code_rag_agent.py
++++ b/pr_agent/code_rag_agent.py
+@@ -21,7 +21,7 @@ class CodeSection(BaseModel):
+     )
+ 
+ class CodeSections(BaseModel):
+-    sections: List[CodeSection] = Field(
++    sections: dict[str,CodeSection] = Field(
+         description="Sections of the codebase returned from the search.",
+     )
+     search_query: str = Field(
+@@ -46,7 +46,8 @@ class CodeRagAgent(Agent):
+ 
+         self.ragTool = RAGTool(
+                 default_index="codebase",
+-                index_paths=["../*.md","../*.py"],
++                index_paths=["../**/*.py","../**/*.md"],
++                recursive=True
+             )
+         
+ 
+@@ -65,24 +66,28 @@ class CodeRagAgent(Agent):
+         searchQuery = request_context.get("query")
+ 
+         searchResult = self.ragTool.search_knowledge_index(query=searchQuery,limit=5)
+-
+-        allSections = CodeSections(sections=[],search_query=query)
++        
++        allSections = CodeSections(sections={},search_query=query)
+ 
+         for nextResult in searchResult:
+-            print(nextResult)
+             file_path = nextResult["source_url"]
+-            similarity_score = nextResult["distance"] if nextResult["distance"] else 0
+-            content = nextResult["content"]
++            if not file_path in allSections.sections:
++                #print(nextResult)
++                
++                similarity_score = nextResult["distance"] if nextResult["distance"] else 0
++                content = nextResult["content"]
+ 
+-            # Only works with Python files
+-            included_defs = []
+-            try:
+-                with open(file_path) as file:
+-                    node = ast.parse(file.read())
+-                    included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)]
+-            except:
++                # Only works with Python files
+                 included_defs = []
++                try:
++                    with open(file_path) as file:
++                        node = ast.parse(file.read())
++                        included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)]
++                except:
++                    included_defs = []
+ 
+-            allSections.sections.append(CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score))
++                allSections.sections[file_path] = CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score)
++            #else:
++                #print("Skipping Duplicate: ",file_path)
+ 
+         yield TurnEnd(self.name, [{"content": allSections}])
+diff --git a/src/agentic/events.py b/src/agentic/events.py
+index 01aa68e..da12979 100644
+--- a/src/agentic/events.py
++++ b/src/agentic/events.py
+@@ -656,7 +656,6 @@ class TurnEnd(Event):
+     def result(self):
+         """Safe result access with fallback"""
+         try:
+-            print(self.agent,self.messages)
+             return self.messages[-1]["content"] if self.messages else "No response generated"
+         except (IndexError, KeyError):
+             return "Error: Malformed response"
+diff --git a/src/agentic/tools/rag_tool.py b/src/agentic/tools/rag_tool.py
+index e3e7280..9891e7c 100644
+--- a/src/agentic/tools/rag_tool.py
++++ b/src/agentic/tools/rag_tool.py
+@@ -14,6 +14,7 @@ from agentic.utils.rag_helper import (
+     init_embedding_model,
+     init_chunker,
+     rag_index_file,
++    rag_index_multiple_files,
+ )
+ 
+ from agentic.utils.summarizer import generate_document_summary
+@@ -44,6 +45,8 @@ class RAGTool(BaseAgenticTool):
+         # Construct the RAG tool. You can pass a list of files and we will ensure that
+         # they are added to the index on startup. Paths can include glob patterns also,
+         # like './docs/*.md'.
++        # Enable recursive (**.md) glob patterns with recursive = True
++
+         self.default_index = default_index
+         self.index_paths = index_paths
+         if self.index_paths:
+@@ -51,8 +54,11 @@ class RAGTool(BaseAgenticTool):
+             if default_index not in list_collections(client):
+                 create_collection(client, default_index, VectorDistances.COSINE)
+             for path in index_paths:
+-                for file_path in [path] if path.startswith("http") else glob.glob(path, recursive=recursive):
+-                    rag_index_file(file_path, self.default_index, client=client, ignore_errors=True)
++                if path.startswith("http"):
++                    rag_index_file(path, self.default_index, client=client, ignore_errors=True)
++                else:
++                    file_paths = glob.glob(path, recursive=recursive)
++                    rag_index_multiple_files(file_paths, self.default_index, client=client, ignore_errors=True)
+ 
+     def get_tools(self) -> List[Callable]:
+         return [
+diff --git a/src/agentic/utils/file_reader.py b/src/agentic/utils/file_reader.py
+index 3ea5ae2..70671fb 100644
+--- a/src/agentic/utils/file_reader.py
++++ b/src/agentic/utils/file_reader.py
+@@ -57,6 +57,9 @@ def read_file(file_path: str, mime_type: str|None = None) -> tuple[str, str]:
+                 return text, mime_type
+         elif mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
+             return pd.read_excel(file_path).to_csv(), mime_type
++        elif mime_type == "text/x-python":
++            with open(file_path,"r") as f:
++                return f.read(), mime_type
+         else:
+             return textract.process(file_path).decode('utf-8'), mime_type
+     except Exception as e:
+diff --git a/src/agentic/utils/rag_helper.py b/src/agentic/utils/rag_helper.py
+index 94dd7f5..1d585f2 100644
+--- a/src/agentic/utils/rag_helper.py
++++ b/src/agentic/utils/rag_helper.py
+@@ -234,7 +234,96 @@ def rag_index_file(
+         if client and client_created:
+             client.close()
+     return "indexed"
++
++def rag_index_multiple_files(
++    file_paths: List[str],
++    index_name: str,
++    chunk_threshold: float = 0.5,
++    chunk_delimiters: str = ". ,! ,? ,\n",
++    embedding_model: str = "BAAI/bge-small-en-v1.5",
++    client: WeaviateClient|None = None,
++    ignore_errors: bool = False,
++    distance_metric: VectorDistances = VectorDistances.COSINE,
++):
++    """Index a file using configurable Weaviate Embedded and chunking parameters"""
++
++    console = Console()
++    client_created = False
++    try:
++        with Status("[bold green]Initializing Weaviate..."):
++            if client is None:
++                client = init_weaviate()
++                client_created = True
++            create_collection(client, index_name, distance_metric)
++            
++        with Status("[bold green]Initializing models..."):
++            embed_model = init_embedding_model(embedding_model)
++            chunker = init_chunker(chunk_threshold, chunk_delimiters)
+         
++        for file_path in file_paths:
++            with Status(f"[bold green]Processing {file_path}...", console=console):
++                text, mime_type = read_file(str(file_path))
++                metadata = prepare_document_metadata(file_path, text, mime_type, GPT_DEFAULT_MODEL)
++
++            console.print(f"[bold green]Indexing {file_path}...")
++
++            collection = client.collections.get(index_name)
++            exists, status = check_document_exists(
++                collection, 
++                metadata["document_id"],
++                metadata["fingerprint"]
++            )
++            
++            if status == "unchanged":
++                console.print(f"[yellow]⏩ Document '{metadata['filename']}' unchanged[/yellow]")
++                continue
++            elif status == "duplicate":
++                console.print(f"[yellow]⚠️ Content already exists under different filename[/yellow]")
++                continue
++            elif status == "changed":
++                console.print(f"[yellow]🔄 Updating changed document '{metadata['filename']}'[/yellow]")
++                collection.data.delete_many(
++                    where=Filter.by_property("document_id").equal(metadata["document_id"])
++                )
++
++            with Status("[bold green]Generating document summary...", console=console):
++                metadata["summary"] = generate_document_summary(
++                    text=text[:12000],
++                    mime_type=mime_type,
++                    model=GPT_DEFAULT_MODEL
++                )
++            
++            chunks = chunker(text)
++            chunks_text = [chunk.text for chunk in chunks]
++            if not chunks_text:
++                if ignore_errors:
++                    return client
++                raise ValueError("No text chunks generated from document")
++            
++            batch_size = 128
++            embeddings = []
++            with Status("[bold green]Generating embeddings..."):
++                for i in range(0, len(chunks_text), batch_size):
++                    batch = chunks_text[i:i+batch_size]
++                    embeddings.extend(list(embed_model.embed(batch)))
++            
++            with Status("[bold green]Indexing chunks..."), collection.batch.dynamic() as batch:
++                for i, chunk in enumerate(chunks):
++                    vector = embeddings[i].tolist()
++                    batch.add_object(
++                        properties={
++                            **metadata,
++                        "content": chunk.text,
++                        "chunk_index": i,
++                        },
++                        vector=vector
++                    )
++                
++        console.print(f"[bold green]✅ Indexed {len(chunks)} chunks in {index_name}")
++    finally:
++        if client and client_created:
++            client.close()
++    return "indexed"
+ 
+ def delete_document_from_index(
+     collection: Any,
diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py
index 2f04d14b..68bcad58 100644
--- a/pr_agent/PR_agent.py
+++ b/pr_agent/PR_agent.py
@@ -137,34 +137,7 @@ def next_turn(
 
         print("queries: "+str(queries))
 
-
-
-        # # Git-Grep queries
-        # all_results = {}
-        # for query in queries.searches[:10]:
-        #     searchResponse = yield from self.git_grep_agent.final_result(
-        #         f"Search codebase with git grep",
-        #         request_context={
-        #             "query": query,
-        #             "thread_id": request_context.get("thread_id")
-        #         }
-        #     )
-            
-        #     # Process each result
-        #     # grep_response.sections is a list of CodeSection objects
-        #     for result in searchResponse.sections:
-        #         if result.file_path not in all_results:
-        #             all_results[result.file_path] = SearchResult(
-        #                 query=query,
-        #                 file_path=result.file_path,
-        #                 content=result.search_result,
-        #                 similarity_score=result.similarity_score,
-        #                 included_defs=result.included_defs
-        #     )
-
-
-
-        # RAG queries
+        # RAG and Git-Grep queries 
         all_results = {}
         for query in queries.searches[:10]:
             searchResponse = yield from self.code_rag_agent.final_result(
@@ -176,9 +149,30 @@ def next_turn(
             )
             
             # Process each result
-            for key, result in searchResponse.sections.items():
-                if not key in all_results:
-                    all_results[key] = SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs)
+            for file, result in searchResponse.sections.items():
+                if not file in all_results:
+                    all_results[file] = SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs)
+            
+            searchResponse = yield from self.git_grep_agent.final_result(
+                f"Search codebase with git grep",
+                request_context={
+                    "query": query,
+                    "thread_id": request_context.get("thread_id")
+                }
+            )
+          
+            # Process each result
+            # searchResponse.sections is a dict mapping file path -> CodeSection
+            for file, result in searchResponse.sections.items():
+                if not file in all_results:
+                    all_results[file] = SearchResult(
+                        query=query,
+                        file_path=result.file_path,
+                        content=result.search_result,
+                        similarity_score=result.similarity_score,
+                        included_defs=result.included_defs
+            )
+
 
         print("all: "+str(all_results))
 
@@ -225,7 +219,8 @@ def next_turn(
 pr_review_agent = PRReviewAgent()
 
 if __name__ == "__main__":
-    with open("PRChanges.patch", "r") as f:
+    # Change to PRChanges.patch for deployment
+    with open("PRChangesTest.patch", "r") as f:
         patch_content = f.read()
     
     # Run the agent
diff --git a/pr_agent/code_rag_agent.py b/pr_agent/code_rag_agent.py
index 32d0a346..474550ef 100644
--- a/pr_agent/code_rag_agent.py
+++ b/pr_agent/code_rag_agent.py
@@ -46,7 +46,7 @@ def __init__(self,
 
         self.ragTool = RAGTool(
                 default_index="codebase",
-                index_paths=["../**/*.py","../**/*.md"],
+                index_paths=["../**/*.md"],
                 recursive=True
             )
         
diff --git a/pr_agent/code_rag_agent.txt b/pr_agent/code_rag_agent.txt
deleted file mode 100644
index b7f36df7..00000000
--- a/pr_agent/code_rag_agent.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-from typing import Any, Generator, List
-from agentic.common import Agent, AgentRunner, ThreadContext
-from agentic.events import Event, ChatOutput, WaitForInput, Prompt, PromptStarted, TurnEnd, ResumeWithInput
-from agentic.models import GPT_4O_MINI # model (using GPT for testing)
-from pydantic import BaseModel, Field
-from agentic.tools.rag_tool import RAGTool
-import ast
-
-class CodeSection(BaseModel):
-    search_result: str = Field(
-        description="Part returned from search.",
-    )
-    file_path: str = Field(
-        description="Path of the file this code belongs to."
-    )
-    included_defs: list[str] = Field(
-        description="Classes and functions defined in this file."
-    )
-    similarity_score: float = Field(
-        desciption="Similarity score returned from vector search."
-    )
-
-class CodeSections(BaseModel):
-    sections: List[CodeSection] = Field(
-        description="Sections of the codebase returned from the search.",
-    )
-    search_query: str = Field(
-        description="Query used to return this section.",
-    )
-
-class CodeRagAgent(Agent):
-    def __init__(self,
-        name="Code Rag Agent",
-        welcome="I am the Code Rag Agent. Please give me a search query (function name,class name, etc.) and I'll return relevant parts of the code.",
-        model: str=GPT_4O_MINI, 
-        result_model = CodeSections,
-        **kwargs
-    ):
-        super().__init__(
-            name=name, 
-            welcome=welcome,
-            model=model,
-            result_model=result_model,
-            **kwargs
-        )
-
-        self.ragTool = RAGTool(
-                default_index="codebase",
-                index_paths=["../*.md","../*.py"],
-            )
-        
-
-    def next_turn(
-        self,
-        request: str|Prompt,
-        request_context: dict = {},
-        request_id: str = None,
-        continue_result: dict = {},
-        debug = "",
-    ) -> Generator[Event, Any, Any]:
-        
-        query = request.payload if isinstance(request, Prompt) else request
-        yield PromptStarted(query, {"query": query})
-
-        searchQuery = request_context.get("query")
-
-        searchResult = self.ragTool.search_knowledge_index(query=searchQuery,limit=5)
-
-        allSections = CodeSections(sections=[],search_query=query)
-
-        for nextResult in searchResult:
-            print(nextResult)
-            file_path = nextResult["source_url"]
-            similarity_score = nextResult["distance"] if nextResult["distance"] else 0
-            content = nextResult["content"]
-
-            # Only works with Python files
-            included_defs = []
-            try:
-                with open(file_path) as file:
-                    node = ast.parse(file.read())
-                    included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)]
-            except:
-                included_defs = []
-
-            allSections.sections.append(CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score))
-
-        yield TurnEnd(self.name, [{"content": allSections}])
diff --git a/pr_agent/git_grep_agent.py b/pr_agent/git_grep_agent.py
index a766924a..7d0ee0f8 100644
--- a/pr_agent/git_grep_agent.py
+++ b/pr_agent/git_grep_agent.py
@@ -7,37 +7,7 @@
 import ast 
 import os
 
-
-# Defines strcutured data containers for the serach/query results 
-# each CodeSection object will represent one match from a git grep search 
-class CodeSection(BaseModel):
-    search_result: str = Field(
-        description="Matching line returned from git grep.",
-    )
-    file_path: str = Field(
-        description="Path of the file containing the match ."
-    )
-    included_defs: list[str] = Field(
-        description="Classes and functions defined in this file."
-    )
-    # consider making the similarity_score optional, since grep doesn't return a similarity score
-    similarity_score: float = Field(
-        description="Similarity score placeholder for git grep, default to 1.0" 
-    )   
-
-
-
-# Represents the collection of matches for one serach query
-class CodeSections(BaseModel):
-    # list of CodeSection objects 
-    sections: List[CodeSection] = Field(
-        description="Sections of the codebase returned from the git grep search.",
-    )
-    # This is the query used for git grep 
-    search_query: str = Field(
-        description="Query used to return this section.",
-    )
-
+from code_rag_agent import CodeSection, CodeSections
 
 # The actual sub-agent that runs git grep and returns structured results 
 class GitGrepAgent(Agent):
@@ -111,11 +81,11 @@ def next_turn(
 
 
         # TODO: verify that sections doesn't have to be a dictionary instead (like code_rag_agent implementation)
-        allSections = CodeSections(sections=[], search_query=search_query)          # creates an empty CodeSections object 
+        allSections = CodeSections(sections={}, search_query=search_query)          # creates an empty CodeSections object 
 
         # loops over each grep match
         for file_path, matched_line in grep_results:
-            if file_path not in allSections.sections:
+            if not file_path in allSections.sections:
                 included_defs = []
                 try:
                     if file_path.endswith(".py"):           # if a python file, parse the AST, and collect all function/class names 
@@ -125,17 +95,17 @@ def next_turn(
                                 n.name for n in node.body
                                 if isinstance (n, ast.ClassDef) or isinstance(n, ast.FunctionDef)
                             ]
+                    else:
+                        continue # ONLY search for .py files
 
                 except:
                     included_defs = []
 
-                # Only add if this file_path hasn’t already been added
-                if not any(sec.file_path == file_path for sec in allSections.sections):
-                    allSections.sections.append(CodeSection(
-                        search_result=matched_line,
-                        file_path=file_path,
-                        included_defs=included_defs,
-                        similarity_score=1.0  # grep doesn't do semantic scoring
-                    ))
+                allSections.sections[file_path] = CodeSection(
+                    search_result=matched_line,
+                    file_path=file_path,
+                    included_defs=included_defs,
+                    similarity_score=1.0  # grep doesn't do semantic scoring
+                )
 
         yield TurnEnd(self.name, [{"content": allSections}])
diff --git a/pr_agent/rag_sub_agent.py b/pr_agent/rag_sub_agent.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/agentic/tools/rag_tool.py b/src/agentic/tools/rag_tool.py
index 9891e7c9..b42bea28 100644
--- a/src/agentic/tools/rag_tool.py
+++ b/src/agentic/tools/rag_tool.py
@@ -15,6 +15,7 @@
     init_chunker,
     rag_index_file,
     rag_index_multiple_files,
+    delete_document_from_index,
 )
 
 from agentic.utils.summarizer import generate_document_summary
@@ -23,6 +24,7 @@
 from weaviate.collections.classes.grpc import Sort
 from weaviate.classes.config import VectorDistances
 
+from rich.console import Console
 
 @tool_registry.register(
     name="RAGTool",
@@ -41,6 +43,7 @@ def __init__(
         default_index: str = "knowledge_base",
         index_paths: list[str] = [],
         recursive: bool = False,
+        overwrite_index = True,
     ):
         # Construct the RAG tool. You can pass a list of files and we will ensure that
         # they are added to the index on startup. Paths can include glob patterns also,
@@ -53,12 +56,44 @@ def __init__(
             client = init_weaviate()
             if default_index not in list_collections(client):
                 create_collection(client, default_index, VectorDistances.COSINE)
+            
+            # Keep track of files found during initialization
+            if overwrite_index:
+                indexed_documents = {}
+
             for path in index_paths:
                 if path.startswith("http"):
-                    rag_index_file(path, self.default_index, client=client, ignore_errors=True)
+                    document_id = rag_index_file(path, self.default_index, client=client, ignore_errors=True)
+
+                    if overwrite_index: 
+                        indexed_documents[document_id] = True
+                    
                 else:
                     file_paths = glob.glob(path, recursive=recursive)
-                    rag_index_multiple_files(file_paths, self.default_index, client=client, ignore_errors=True)
+                    document_ids = rag_index_multiple_files(file_paths, self.default_index, client=client, ignore_errors=True)
+
+                    if overwrite_index:
+                        for document_id in document_ids:
+                            indexed_documents[document_id] = True
+
+            # Delete indexed files not found during initialization
+            if overwrite_index:
+                try:
+                    console = Console()
+                    collection = client.collections.get(self.default_index)
+                    documents = list_documents_in_collection(collection)
+
+                    for document in documents:
+                        if not document["document_id"] in indexed_documents:
+                            console.print(f"[bold green]✅ Removing deleted file {document["filename"]} from index")
+                            delete_document_from_index(collection=collection,document_id=document["document_id"],filename=document["filename"])
+
+                except Exception as e:
+                    print(f"Error listing documents: {str(e)}")
+                    return
+                finally:
+                    if client:
+                        client.close()
 
     def get_tools(self) -> List[Callable]:
         return [
@@ -66,7 +101,7 @@ def get_tools(self) -> List[Callable]:
             #self.list_indexes,
             self.search_knowledge_index,
             self.list_documents,
-            self.review_full_document
+            self.review_full_document,
         ]
 
     def save_content_to_knowledge_index(
diff --git a/src/agentic/utils/rag_helper.py b/src/agentic/utils/rag_helper.py
index 1d585f2d..89a1f390 100644
--- a/src/agentic/utils/rag_helper.py
+++ b/src/agentic/utils/rag_helper.py
@@ -91,7 +91,7 @@ def prepare_document_metadata(
     
     # Generate document ID from filename
     metadata["document_id"] = hashlib.sha256(
-        metadata["filename"].encode()
+        str(Path(file_path)).encode()
     ).hexdigest()
     
     return metadata
@@ -155,7 +155,7 @@ def rag_index_file(
     client: WeaviateClient|None = None,
     ignore_errors: bool = False,
     distance_metric: VectorDistances = VectorDistances.COSINE,
-):
+) -> str:
     """Index a file using configurable Weaviate Embedded and chunking parameters"""
 
     console = Console()
@@ -186,10 +186,10 @@ def rag_index_file(
         
         if status == "unchanged":
             console.print(f"[yellow]⏩ Document '{metadata['filename']}' unchanged[/yellow]")
-            return
+            return metadata["document_id"]
         elif status == "duplicate":
             console.print(f"[yellow]⚠️ Content already exists under different filename[/yellow]")
-            return
+            return metadata["document_id"]
         elif status == "changed":
             console.print(f"[yellow]🔄 Updating changed document '{metadata['filename']}'[/yellow]")
             collection.data.delete_many(
@@ -233,7 +233,7 @@ def rag_index_file(
     finally:
         if client and client_created:
             client.close()
-    return "indexed"
+    return metadata["document_id"]
 
 def rag_index_multiple_files(
     file_paths: List[str],
@@ -244,11 +244,13 @@ def rag_index_multiple_files(
     client: WeaviateClient|None = None,
     ignore_errors: bool = False,
     distance_metric: VectorDistances = VectorDistances.COSINE,
-):
+) -> List[str]:
     """Index a file using configurable Weaviate Embedded and chunking parameters"""
 
     console = Console()
     client_created = False
+
+    documents_indexed = []
     try:
         with Status("[bold green]Initializing Weaviate..."):
             if client is None:
@@ -276,9 +278,11 @@ def rag_index_multiple_files(
             
             if status == "unchanged":
                 console.print(f"[yellow]⏩ Document '{metadata['filename']}' unchanged[/yellow]")
+                documents_indexed.append(metadata["document_id"])
                 continue
             elif status == "duplicate":
                 console.print(f"[yellow]⚠️ Content already exists under different filename[/yellow]")
+                documents_indexed.append(metadata["document_id"])
                 continue
             elif status == "changed":
                 console.print(f"[yellow]🔄 Updating changed document '{metadata['filename']}'[/yellow]")
@@ -318,12 +322,13 @@ def rag_index_multiple_files(
                         },
                         vector=vector
                     )
-                
-        console.print(f"[bold green]✅ Indexed {len(chunks)} chunks in {index_name}")
+            
+            documents_indexed.append(metadata["document_id"])
+            console.print(f"[bold green]✅ Indexed {len(chunks)} chunks in {index_name}")
     finally:
         if client and client_created:
             client.close()
-    return "indexed"
+    return documents_indexed
 
 def delete_document_from_index(
     collection: Any,

From 050a654a8b096f5ebf2b89f809d3801c4d382582 Mon Sep 17 00:00:00 2001
From: Brandon Li <brandonw.li1207@gmail.com>
Date: Thu, 7 Aug 2025 04:31:00 -0400
Subject: [PATCH 2/8] yaml fix

---
 .github/workflows/pr-summary-agent.yml | 2 +-
 pr_agent/PR_agent.py                   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml
index a1f85441..c7352429 100644
--- a/.github/workflows/pr-summary-agent.yml
+++ b/.github/workflows/pr-summary-agent.yml
@@ -40,7 +40,7 @@ jobs:
           REPO_OWNER: supercog-ai
           REPO_NAME: PR_code_review-agent
       
-       -name: Update weaviate cache
+      - name: Update weaviate cache
         uses: actions/upload-artifact@v4
         with:
           name: weaviate
diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py
index 68bcad58..1598e69b 100644
--- a/pr_agent/PR_agent.py
+++ b/pr_agent/PR_agent.py
@@ -219,8 +219,8 @@ def next_turn(
 pr_review_agent = PRReviewAgent()
 
 if __name__ == "__main__":
-    # Change to PRChanges.patch for deployment
-    with open("PRChangesTest.patch", "r") as f:
+    # Change to PRChangesTest.patch for testing
+    with open("PRChanges.patch", "r") as f:
         patch_content = f.read()
     
     # Run the agent

From e391397d8a3b153f86781861664b9c2ee8eb2e31 Mon Sep 17 00:00:00 2001
From: Brandon Li <brandonw.li1207@gmail.com>
Date: Thu, 7 Aug 2025 04:38:44 -0400
Subject: [PATCH 3/8] yaml fix

---
 .github/workflows/pr-summary-agent.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml
index c7352429..f7df5029 100644
--- a/.github/workflows/pr-summary-agent.yml
+++ b/.github/workflows/pr-summary-agent.yml
@@ -23,7 +23,7 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           name: weaviate
-          path: ${{ HOME }}/.cache/weaviate
+          path: /home/.cache/weaviate
 
       - name: Run agent
         run: |
@@ -44,4 +44,4 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: weaviate
-          path: ${{ HOME }}/.cache/weaviate
+          path: /home/.cache/weaviate

From 9329b417130ed5e0895f95fbdc003ef710a89722 Mon Sep 17 00:00:00 2001
From: Brandon Li <brandonw.li1207@gmail.com>
Date: Thu, 7 Aug 2025 04:46:17 -0400
Subject: [PATCH 4/8] Remove name/path inputs from download-artifact step

---
 .github/workflows/pr-summary-agent.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml
index f7df5029..39721882 100644
--- a/.github/workflows/pr-summary-agent.yml
+++ b/.github/workflows/pr-summary-agent.yml
@@ -21,9 +21,6 @@ jobs:
 
       - name: Download a single artifact
         uses: actions/download-artifact@v4
-        with:
-          name: weaviate
-          path: /home/.cache/weaviate
 
       - name: Run agent
         run: |

From 17286d2bb574c555fe3da4fa4ce26830521e5814 Mon Sep 17 00:00:00 2001
From: Brandon Li <brandonw.li1207@gmail.com>
Date: Thu, 7 Aug 2025 04:50:11 -0400
Subject: [PATCH 5/8] Add missing typing and agentic imports to PR_agent.py

---
 pr_agent/PR_agent.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py
index c3fd1cc1..3fdc7a31 100644
--- a/pr_agent/PR_agent.py
+++ b/pr_agent/PR_agent.py
@@ -8,6 +8,9 @@
 from git_grep_agent import GitGrepAgent
 from summary_agent import SummaryAgent
 from pydantic import BaseModel
+from typing import Dict, List, Any, Generator, Optional, Tuple
+from agentic.common import Agent, AgentRunner, ThreadContext
+from agentic.events import Event, ChatOutput, TurnEnd, PromptStarted, Prompt
 
 load_dotenv()
 

From efe387cfd87511f142799ee71fae554c2fa9dead Mon Sep 17 00:00:00 2001
From: Brandon Li <brandonw.li1207@gmail.com>
Date: Thu, 7 Aug 2025 04:54:46 -0400
Subject: [PATCH 6/8] Revert "yaml fix"

This reverts commit e391397d8a3b153f86781861664b9c2ee8eb2e31.
---
 .github/workflows/pr-summary-agent.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml
index 39721882..c7f6ca36 100644
--- a/.github/workflows/pr-summary-agent.yml
+++ b/.github/workflows/pr-summary-agent.yml
@@ -41,4 +41,4 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: weaviate
-          path: /home/.cache/weaviate
+          path: ${{ HOME }}/.cache/weaviate

From 0fa0ea2a9ae0dc957f4070baf4a74717767ed317 Mon Sep 17 00:00:00 2001
From: Brandon Li <brandonw.li1207@gmail.com>
Date: Thu, 7 Aug 2025 04:57:38 -0400
Subject: [PATCH 7/8] Replace HOME expression with literal path in upload-artifact step

---
 .github/workflows/pr-summary-agent.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml
index c7f6ca36..ac8cb038 100644
--- a/.github/workflows/pr-summary-agent.yml
+++ b/.github/workflows/pr-summary-agent.yml
@@ -41,4 +41,4 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: weaviate
-          path: ${{ HOME }}/.cache/weaviate
+          path: home/.cache/weaviate

From 74b1eac05952953f476b955465bb001d54d37182 Mon Sep 17 00:00:00 2001
From: Brandon Li <brandonw.li1207@gmail.com>
Date: Thu, 7 Aug 2025 04:58:52 -0400
Subject: [PATCH 8/8] Call final_result instead of generate in PR_agent.py

---
 pr_agent/PR_agent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py
index 3fdc7a31..3867aeff 100644
--- a/pr_agent/PR_agent.py
+++ b/pr_agent/PR_agent.py
@@ -217,4 +217,4 @@ def next_turn(
         patch_content = f.read()
     
     # Run the agent
-    print(pr_review_agent.generate(patch_content))
\ No newline at end of file
+    print(pr_review_agent.final_result(patch_content))
\ No newline at end of file