From f20848ccb6f7dabc3c743c9ff894580956fec311 Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Thu, 7 Aug 2025 04:27:20 -0400 Subject: [PATCH 1/8] integrated git grep to work alongside rag, plus several new RAG functionalities --- .github/workflows/pr-summary-agent.yml | 14 +- PRChangesTest.patch | 348 +++++++++++++++++++++++++ pr_agent/PR_agent.py | 59 ++--- pr_agent/code_rag_agent.py | 2 +- pr_agent/code_rag_agent.txt | 88 ------- pr_agent/git_grep_agent.py | 52 +--- pr_agent/rag_sub_agent.py | 0 src/agentic/tools/rag_tool.py | 41 ++- src/agentic/utils/rag_helper.py | 23 +- 9 files changed, 452 insertions(+), 175 deletions(-) create mode 100644 PRChangesTest.patch delete mode 100644 pr_agent/code_rag_agent.txt delete mode 100644 pr_agent/rag_sub_agent.py diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml index 1fe312bf..a1f85441 100644 --- a/.github/workflows/pr-summary-agent.yml +++ b/.github/workflows/pr-summary-agent.yml @@ -18,7 +18,13 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v6 - + + - name: Download a single artifact + uses: actions/download-artifact@v4 + with: + name: weaviate + path: ${{ HOME }}/.cache/weaviate + - name: Run agent run: | uv venv --python 3.12 @@ -33,3 +39,9 @@ jobs: GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} REPO_OWNER: supercog-ai REPO_NAME: PR_code_review-agent + + -name: Update weaviate cache + uses: actions/upload-artifact@v4 + with: + name: weaviate + path: ${{ HOME }}/.cache/weaviate diff --git a/PRChangesTest.patch b/PRChangesTest.patch new file mode 100644 index 00000000..80fe4bfc --- /dev/null +++ b/PRChangesTest.patch @@ -0,0 +1,348 @@ +diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml +index 88e50da..1fe312b 100644 +--- a/.github/workflows/pr-summary-agent.yml ++++ b/.github/workflows/pr-summary-agent.yml +@@ -25,10 +25,8 @@ jobs: + uv pip install -e "../PR_code_review-agent[all,dev]" --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-first-match + git diff --merge-base HEAD^1 HEAD > PRChanges.patch + cat PRChanges.patch +- uv run pr_agent/test_files/mock_pr_agent.py +- +-#uv run pr_agent/PR_agent.py +- ++ uv run pr_agent/PR_agent.py ++ + env: + OPENAI_API_KEY: ${{ secrets.PRAgentOpenAIKey }} + PR_ID: ${{ github.event.pull_request.number }} +diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py +index 46b8f1f..c99d863 100644 +--- a/pr_agent/PR_agent.py ++++ b/pr_agent/PR_agent.py +@@ -29,14 +29,6 @@ class SearchResult(BaseModel): + similarity_score: float = Field( + desciption="Similarity score returned from vector search." + ) +- is_relevant: bool = Field( +- default = True, +- description="Boolean describing if the search result is relevant to the query." +- ) +- relevance_reason: str = Field( +- default = "", +- description="Boolean describing if the search result is relevant to the query." +- ) + included_defs: List[str] = Field( + default_factory=list, + desciption="Similarity score returned from vector search." +@@ -49,7 +41,6 @@ class Searches(BaseModel): + + class RelevanceResult(BaseModel): + relevant: bool +- reason: str + + class PRReviewAgent(Agent): + +@@ -81,7 +72,7 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a + + self.relevanceAgent = Agent( + name="Code Relevange Agent", +- instructions="""You are an expert in determining if a snippet of code or documentation is needed to determine the purpose of a code change from the patch file. Your response must include a 'relevant' field boolean and a 'reason' field with a brief explanation.""", ++ instructions="""You are an expert in determining if a snippet of code or documentation is directly relevant to a query. Your response must include a 'relevant' field boolean.""", + model=GPT_4O_MINI, + result_model=RelevanceResult, + ) +@@ -142,10 +133,10 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a + } + ) + +- print("quer"+str(queries)) ++ print("queries: "+str(queries)) + +- all_results = [] +- ++ # RAG queries ++ all_results = {} + for query in queries.searches[:10]: + searchResponse = yield from self.code_rag_agent.final_result( + f"Search codebase", +@@ -156,35 +147,34 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a + ) + + # Process each result +- for result in searchResponse.sections: +- all_results.append(SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs)) ++ for key, result in searchResponse.sections.items(): ++ if not key in all_results: ++ all_results[key] = SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs) + +- print("fil"+str(all_results)) ++ print("all: "+str(all_results)) + + # Filter search results using LLM-based relevance checking + filtered_results = [] +- +- for result in all_results: +- if result.similarity_score < 0.5: +- continue +- +- relevance_check = yield from self.relevanceAgent.final_result( +- f"\n{request_context.get("patch_content")}\n\n\n{result.content}{result.query}" +- ) +- +- print(relevance_check) +- +- result.is_relevant = relevance_check.relevant +- result.relevance_reason = relevance_check.reason ++ for result in all_results.values(): + +- if result.is_relevant: +- filtered_results.append(result) ++ try: ++ relevance_check = yield from self.relevanceAgent.final_result( ++ f"\n{request_context.get("patch_content")}\n\n\n{result.content}{result.query}" ++ ) ++ ++ if relevance_check.relevant: ++ filtered_results.append(result) ++ except Exception as e: ++ # LLM error ++ print(e) + +- print(str(filtered_results)) ++ print("filtered: ",str(filtered_results)) + + # Prepare for summary + formatted_str = self.prepare_summary(request_context.get("patch_content"),filtered_results) + ++ print(formatted_str) ++ + summary = yield from self.summaryAgent.final_result( + formatted_str + ) +diff --git a/pr_agent/code_rag_agent.py b/pr_agent/code_rag_agent.py +index b7f36df..32d0a34 100644 +--- a/pr_agent/code_rag_agent.py ++++ b/pr_agent/code_rag_agent.py +@@ -21,7 +21,7 @@ class CodeSection(BaseModel): + ) + + class CodeSections(BaseModel): +- sections: List[CodeSection] = Field( ++ sections: dict[str,CodeSection] = Field( + description="Sections of the codebase returned from the search.", + ) + search_query: str = Field( +@@ -46,7 +46,8 @@ class CodeRagAgent(Agent): + + self.ragTool = RAGTool( + default_index="codebase", +- index_paths=["../*.md","../*.py"], ++ index_paths=["../**/*.py","../**/*.md"], ++ recursive=True + ) + + +@@ -65,24 +66,28 @@ class CodeRagAgent(Agent): + searchQuery = request_context.get("query") + + searchResult = self.ragTool.search_knowledge_index(query=searchQuery,limit=5) +- +- allSections = CodeSections(sections=[],search_query=query) ++ ++ allSections = CodeSections(sections={},search_query=query) + + for nextResult in searchResult: +- print(nextResult) + file_path = nextResult["source_url"] +- similarity_score = nextResult["distance"] if nextResult["distance"] else 0 +- content = nextResult["content"] ++ if not file_path in allSections.sections: ++ #print(nextResult) ++ ++ similarity_score = nextResult["distance"] if nextResult["distance"] else 0 ++ content = nextResult["content"] + +- # Only works with Python files +- included_defs = [] +- try: +- with open(file_path) as file: +- node = ast.parse(file.read()) +- included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)] +- except: ++ # Only works with Python files + included_defs = [] ++ try: ++ with open(file_path) as file: ++ node = ast.parse(file.read()) ++ included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)] ++ except: ++ included_defs = [] + +- allSections.sections.append(CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score)) ++ allSections.sections[file_path] = CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score) ++ #else: ++ #print("Skipping Duplicate: ",file_path) + + yield TurnEnd(self.name, [{"content": allSections}]) +diff --git a/src/agentic/events.py b/src/agentic/events.py +index 01aa68e..da12979 100644 +--- a/src/agentic/events.py ++++ b/src/agentic/events.py +@@ -656,7 +656,6 @@ class TurnEnd(Event): + def result(self): + """Safe result access with fallback""" + try: +- print(self.agent,self.messages) + return self.messages[-1]["content"] if self.messages else "No response generated" + except (IndexError, KeyError): + return "Error: Malformed response" +diff --git a/src/agentic/tools/rag_tool.py b/src/agentic/tools/rag_tool.py +index e3e7280..9891e7c 100644 +--- a/src/agentic/tools/rag_tool.py ++++ b/src/agentic/tools/rag_tool.py +@@ -14,6 +14,7 @@ from agentic.utils.rag_helper import ( + init_embedding_model, + init_chunker, + rag_index_file, ++ rag_index_multiple_files, + ) + + from agentic.utils.summarizer import generate_document_summary +@@ -44,6 +45,8 @@ class RAGTool(BaseAgenticTool): + # Construct the RAG tool. You can pass a list of files and we will ensure that + # they are added to the index on startup. Paths can include glob patterns also, + # like './docs/*.md'. ++ # Enable recursive (**.md) glob patterns with recursive = True ++ + self.default_index = default_index + self.index_paths = index_paths + if self.index_paths: +@@ -51,8 +54,11 @@ class RAGTool(BaseAgenticTool): + if default_index not in list_collections(client): + create_collection(client, default_index, VectorDistances.COSINE) + for path in index_paths: +- for file_path in [path] if path.startswith("http") else glob.glob(path, recursive=recursive): +- rag_index_file(file_path, self.default_index, client=client, ignore_errors=True) ++ if path.startswith("http"): ++ rag_index_file(path, self.default_index, client=client, ignore_errors=True) ++ else: ++ file_paths = glob.glob(path, recursive=recursive) ++ rag_index_multiple_files(file_paths, self.default_index, client=client, ignore_errors=True) + + def get_tools(self) -> List[Callable]: + return [ +diff --git a/src/agentic/utils/file_reader.py b/src/agentic/utils/file_reader.py +index 3ea5ae2..70671fb 100644 +--- a/src/agentic/utils/file_reader.py ++++ b/src/agentic/utils/file_reader.py +@@ -57,6 +57,9 @@ def read_file(file_path: str, mime_type: str|None = None) -> tuple[str, str]: + return text, mime_type + elif mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": + return pd.read_excel(file_path).to_csv(), mime_type ++ elif mime_type == "text/x-python": ++ with open(file_path,"r") as f: ++ return f.read(), mime_type + else: + return textract.process(file_path).decode('utf-8'), mime_type + except Exception as e: +diff --git a/src/agentic/utils/rag_helper.py b/src/agentic/utils/rag_helper.py +index 94dd7f5..1d585f2 100644 +--- a/src/agentic/utils/rag_helper.py ++++ b/src/agentic/utils/rag_helper.py +@@ -234,7 +234,96 @@ def rag_index_file( + if client and client_created: + client.close() + return "indexed" ++ ++def rag_index_multiple_files( ++ file_paths: List[str], ++ index_name: str, ++ chunk_threshold: float = 0.5, ++ chunk_delimiters: str = ". ,! ,? ,\n", ++ embedding_model: str = "BAAI/bge-small-en-v1.5", ++ client: WeaviateClient|None = None, ++ ignore_errors: bool = False, ++ distance_metric: VectorDistances = VectorDistances.COSINE, ++): ++ """Index a file using configurable Weaviate Embedded and chunking parameters""" ++ ++ console = Console() ++ client_created = False ++ try: ++ with Status("[bold green]Initializing Weaviate..."): ++ if client is None: ++ client = init_weaviate() ++ client_created = True ++ create_collection(client, index_name, distance_metric) ++ ++ with Status("[bold green]Initializing models..."): ++ embed_model = init_embedding_model(embedding_model) ++ chunker = init_chunker(chunk_threshold, chunk_delimiters) + ++ for file_path in file_paths: ++ with Status(f"[bold green]Processing {file_path}...", console=console): ++ text, mime_type = read_file(str(file_path)) ++ metadata = prepare_document_metadata(file_path, text, mime_type, GPT_DEFAULT_MODEL) ++ ++ console.print(f"[bold green]Indexing {file_path}...") ++ ++ collection = client.collections.get(index_name) ++ exists, status = check_document_exists( ++ collection, ++ metadata["document_id"], ++ metadata["fingerprint"] ++ ) ++ ++ if status == "unchanged": ++ console.print(f"[yellow]⏩ Document '{metadata['filename']}' unchanged[/yellow]") ++ continue ++ elif status == "duplicate": ++ console.print(f"[yellow]⚠️ Content already exists under different filename[/yellow]") ++ continue ++ elif status == "changed": ++ console.print(f"[yellow]🔄 Updating changed document '{metadata['filename']}'[/yellow]") ++ collection.data.delete_many( ++ where=Filter.by_property("document_id").equal(metadata["document_id"]) ++ ) ++ ++ with Status("[bold green]Generating document summary...", console=console): ++ metadata["summary"] = generate_document_summary( ++ text=text[:12000], ++ mime_type=mime_type, ++ model=GPT_DEFAULT_MODEL ++ ) ++ ++ chunks = chunker(text) ++ chunks_text = [chunk.text for chunk in chunks] ++ if not chunks_text: ++ if ignore_errors: ++ return client ++ raise ValueError("No text chunks generated from document") ++ ++ batch_size = 128 ++ embeddings = [] ++ with Status("[bold green]Generating embeddings..."): ++ for i in range(0, len(chunks_text), batch_size): ++ batch = chunks_text[i:i+batch_size] ++ embeddings.extend(list(embed_model.embed(batch))) ++ ++ with Status("[bold green]Indexing chunks..."), collection.batch.dynamic() as batch: ++ for i, chunk in enumerate(chunks): ++ vector = embeddings[i].tolist() ++ batch.add_object( ++ properties={ ++ **metadata, ++ "content": chunk.text, ++ "chunk_index": i, ++ }, ++ vector=vector ++ ) ++ ++ console.print(f"[bold green]✅ Indexed {len(chunks)} chunks in {index_name}") ++ finally: ++ if client and client_created: ++ client.close() ++ return "indexed" + + def delete_document_from_index( + collection: Any, diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py index 2f04d14b..68bcad58 100644 --- a/pr_agent/PR_agent.py +++ b/pr_agent/PR_agent.py @@ -137,34 +137,7 @@ def next_turn( print("queries: "+str(queries)) - - - # # Git-Grep queries - # all_results = {} - # for query in queries.searches[:10]: - # searchResponse = yield from self.git_grep_agent.final_result( - # f"Search codebase with git grep", - # request_context={ - # "query": query, - # "thread_id": request_context.get("thread_id") - # } - # ) - - # # Process each result - # # grep_response.sections is a list of CodeSection objects - # for result in searchResponse.sections: - # if result.file_path not in all_results: - # all_results[result.file_path] = SearchResult( - # query=query, - # file_path=result.file_path, - # content=result.search_result, - # similarity_score=result.similarity_score, - # included_defs=result.included_defs - # ) - - - - # RAG queries + # RAG and Git-Grep queries all_results = {} for query in queries.searches[:10]: searchResponse = yield from self.code_rag_agent.final_result( @@ -176,9 +149,30 @@ def next_turn( ) # Process each result - for key, result in searchResponse.sections.items(): - if not key in all_results: - all_results[key] = SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs) + for file, result in searchResponse.sections.items(): + if not file in all_results: + all_results[file] = SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs) + + searchResponse = yield from self.git_grep_agent.final_result( + f"Search codebase with git grep", + request_context={ + "query": query, + "thread_id": request_context.get("thread_id") + } + ) + + # Process each result + # grep_response.sections is a list of CodeSection objects + for file, result in searchResponse.sections.items(): + if not file in all_results: + all_results[file] = SearchResult( + query=query, + file_path=result.file_path, + content=result.search_result, + similarity_score=result.similarity_score, + included_defs=result.included_defs + ) + print("all: "+str(all_results)) @@ -225,7 +219,8 @@ def next_turn( pr_review_agent = PRReviewAgent() if __name__ == "__main__": - with open("PRChanges.patch", "r") as f: + # Change to PRChanges.patch for deployment + with open("PRChangesTest.patch", "r") as f: patch_content = f.read() # Run the agent diff --git a/pr_agent/code_rag_agent.py b/pr_agent/code_rag_agent.py index 32d0a346..474550ef 100644 --- a/pr_agent/code_rag_agent.py +++ b/pr_agent/code_rag_agent.py @@ -46,7 +46,7 @@ def __init__(self, self.ragTool = RAGTool( default_index="codebase", - index_paths=["../**/*.py","../**/*.md"], + index_paths=["../**/*.md"], recursive=True ) diff --git a/pr_agent/code_rag_agent.txt b/pr_agent/code_rag_agent.txt deleted file mode 100644 index b7f36df7..00000000 --- a/pr_agent/code_rag_agent.txt +++ /dev/null @@ -1,88 +0,0 @@ -from typing import Any, Generator, List -from agentic.common import Agent, AgentRunner, ThreadContext -from agentic.events import Event, ChatOutput, WaitForInput, Prompt, PromptStarted, TurnEnd, ResumeWithInput -from agentic.models import GPT_4O_MINI # model (using GPT for testing) -from pydantic import BaseModel, Field -from agentic.tools.rag_tool import RAGTool -import ast - -class CodeSection(BaseModel): - search_result: str = Field( - description="Part returned from search.", - ) - file_path: str = Field( - description="Path of the file this code belongs to." - ) - included_defs: list[str] = Field( - description="Classes and functions defined in this file." - ) - similarity_score: float = Field( - desciption="Similarity score returned from vector search." - ) - -class CodeSections(BaseModel): - sections: List[CodeSection] = Field( - description="Sections of the codebase returned from the search.", - ) - search_query: str = Field( - description="Query used to return this section.", - ) - -class CodeRagAgent(Agent): - def __init__(self, - name="Code Rag Agent", - welcome="I am the Code Rag Agent. Please give me a search query (function name,class name, etc.) and I'll return relevant parts of the code.", - model: str=GPT_4O_MINI, - result_model = CodeSections, - **kwargs - ): - super().__init__( - name=name, - welcome=welcome, - model=model, - result_model=result_model, - **kwargs - ) - - self.ragTool = RAGTool( - default_index="codebase", - index_paths=["../*.md","../*.py"], - ) - - - def next_turn( - self, - request: str|Prompt, - request_context: dict = {}, - request_id: str = None, - continue_result: dict = {}, - debug = "", - ) -> Generator[Event, Any, Any]: - - query = request.payload if isinstance(request, Prompt) else request - yield PromptStarted(query, {"query": query}) - - searchQuery = request_context.get("query") - - searchResult = self.ragTool.search_knowledge_index(query=searchQuery,limit=5) - - allSections = CodeSections(sections=[],search_query=query) - - for nextResult in searchResult: - print(nextResult) - file_path = nextResult["source_url"] - similarity_score = nextResult["distance"] if nextResult["distance"] else 0 - content = nextResult["content"] - - # Only works with Python files - included_defs = [] - try: - with open(file_path) as file: - node = ast.parse(file.read()) - included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)] - except: - included_defs = [] - - allSections.sections.append(CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score)) - - yield TurnEnd(self.name, [{"content": allSections}]) diff --git a/pr_agent/git_grep_agent.py b/pr_agent/git_grep_agent.py index a766924a..7d0ee0f8 100644 --- a/pr_agent/git_grep_agent.py +++ b/pr_agent/git_grep_agent.py @@ -7,37 +7,7 @@ import ast import os - -# Defines strcutured data containers for the serach/query results -# each CodeSection object will represent one match from a git grep search -class CodeSection(BaseModel): - search_result: str = Field( - description="Matching line returned from git grep.", - ) - file_path: str = Field( - description="Path of the file containing the match ." - ) - included_defs: list[str] = Field( - description="Classes and functions defined in this file." - ) - # consider making the similarity_score optional, since grep doesn't return a similarity score - similarity_score: float = Field( - description="Similarity score placeholder for git grep, default to 1.0" - ) - - - -# Represents the collection of matches for one serach query -class CodeSections(BaseModel): - # list of CodeSection objects - sections: List[CodeSection] = Field( - description="Sections of the codebase returned from the git grep search.", - ) - # This is the query used for git grep - search_query: str = Field( - description="Query used to return this section.", - ) - +from code_rag_agent import CodeSection, CodeSections # The actual sub-agent that runs git grep and returns structured results class GitGrepAgent(Agent): @@ -111,11 +81,11 @@ def next_turn( # TODO: verify that sections doesn't have to be a dictionary instead (like code_rag_agent implementation) - allSections = CodeSections(sections=[], search_query=search_query) # creates an empty CodeSections object + allSections = CodeSections(sections={}, search_query=search_query) # creates an empty CodeSections object # loops over each grep match for file_path, matched_line in grep_results: - if file_path not in allSections.sections: + if not file_path in allSections.sections: included_defs = [] try: if file_path.endswith(".py"): # if a python file, parse the AST, and collect all function/class names @@ -125,17 +95,17 @@ def next_turn( n.name for n in node.body if isinstance (n, ast.ClassDef) or isinstance(n, ast.FunctionDef) ] + else: + continue # ONLY search for .py files except: included_defs = [] - # Only add if this file_path hasn’t already been added - if not any(sec.file_path == file_path for sec in allSections.sections): - allSections.sections.append(CodeSection( - search_result=matched_line, - file_path=file_path, - included_defs=included_defs, - similarity_score=1.0 # grep doesn't do semantic scoring - )) + allSections.sections[file_path] = CodeSection( + search_result=matched_line, + file_path=file_path, + included_defs=included_defs, + similarity_score=1.0 # grep doesn't do semantic scoring + ) yield TurnEnd(self.name, [{"content": allSections}]) diff --git a/pr_agent/rag_sub_agent.py b/pr_agent/rag_sub_agent.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/agentic/tools/rag_tool.py b/src/agentic/tools/rag_tool.py index 9891e7c9..b42bea28 100644 --- a/src/agentic/tools/rag_tool.py +++ b/src/agentic/tools/rag_tool.py @@ -15,6 +15,7 @@ init_chunker, rag_index_file, rag_index_multiple_files, + delete_document_from_index, ) from agentic.utils.summarizer import generate_document_summary @@ -23,6 +24,7 @@ from weaviate.collections.classes.grpc import Sort from weaviate.classes.config import VectorDistances +from rich.console import Console @tool_registry.register( name="RAGTool", @@ -41,6 +43,7 @@ def __init__( default_index: str = "knowledge_base", index_paths: list[str] = [], recursive: bool = False, + overwrite_index = True, ): # Construct the RAG tool. You can pass a list of files and we will ensure that # they are added to the index on startup. Paths can include glob patterns also, @@ -53,12 +56,44 @@ def __init__( client = init_weaviate() if default_index not in list_collections(client): create_collection(client, default_index, VectorDistances.COSINE) + + # Keep track of files found during initialization + if overwrite_index: + indexed_documents = {} + for path in index_paths: if path.startswith("http"): - rag_index_file(path, self.default_index, client=client, ignore_errors=True) + document_id = rag_index_file(path, self.default_index, client=client, ignore_errors=True) + + if overwrite_index: + indexed_documents[document_id] = True + else: file_paths = glob.glob(path, recursive=recursive) - rag_index_multiple_files(file_paths, self.default_index, client=client, ignore_errors=True) + document_ids = rag_index_multiple_files(file_paths, self.default_index, client=client, ignore_errors=True) + + if overwrite_index: + for document_id in document_ids: + indexed_documents[document_id] = True + + # Delete indexed files not found during initialization + if overwrite_index: + try: + console = Console() + collection = client.collections.get(self.default_index) + documents = list_documents_in_collection(collection) + + for document in documents: + if not document["document_id"] in indexed_documents: + console.print(f"[bold green]✅ Removing deleted file {document["filename"]} from index") + delete_document_from_index(collection=collection,document_id=document["document_id"],filename=document["filename"]) + + except Exception as e: + print(f"Error listing documents: {str(e)}") + return + finally: + if client: + client.close() def get_tools(self) -> List[Callable]: return [ @@ -66,7 +101,7 @@ def get_tools(self) -> List[Callable]: #self.list_indexes, self.search_knowledge_index, self.list_documents, - self.review_full_document + self.review_full_document, ] def save_content_to_knowledge_index( diff --git a/src/agentic/utils/rag_helper.py b/src/agentic/utils/rag_helper.py index 1d585f2d..89a1f390 100644 --- a/src/agentic/utils/rag_helper.py +++ b/src/agentic/utils/rag_helper.py @@ -91,7 +91,7 @@ def prepare_document_metadata( # Generate document ID from filename metadata["document_id"] = hashlib.sha256( - metadata["filename"].encode() + str(Path(file_path)).encode() ).hexdigest() return metadata @@ -155,7 +155,7 @@ def rag_index_file( client: WeaviateClient|None = None, ignore_errors: bool = False, distance_metric: VectorDistances = VectorDistances.COSINE, -): +) -> str: """Index a file using configurable Weaviate Embedded and chunking parameters""" console = Console() @@ -186,10 +186,10 @@ def rag_index_file( if status == "unchanged": console.print(f"[yellow]⏩ Document '{metadata['filename']}' unchanged[/yellow]") - return + return metadata["document_id"] elif status == "duplicate": console.print(f"[yellow]⚠️ Content already exists under different filename[/yellow]") - return + return metadata["document_id"] elif status == "changed": console.print(f"[yellow]🔄 Updating changed document '{metadata['filename']}'[/yellow]") collection.data.delete_many( @@ -233,7 +233,7 @@ def rag_index_file( finally: if client and client_created: client.close() - return "indexed" + return metadata["document_id"] def rag_index_multiple_files( file_paths: List[str], @@ -244,11 +244,13 @@ def rag_index_multiple_files( client: WeaviateClient|None = None, ignore_errors: bool = False, distance_metric: VectorDistances = VectorDistances.COSINE, -): +) -> List[str]: """Index a file using configurable Weaviate Embedded and chunking parameters""" console = Console() client_created = False + + documents_indexed = [] try: with Status("[bold green]Initializing Weaviate..."): if client is None: @@ -276,9 +278,11 @@ def rag_index_multiple_files( if status == "unchanged": console.print(f"[yellow]⏩ Document '{metadata['filename']}' unchanged[/yellow]") + documents_indexed.append(metadata["document_id"]) continue elif status == "duplicate": console.print(f"[yellow]⚠️ Content already exists under different filename[/yellow]") + documents_indexed.append(metadata["document_id"]) continue elif status == "changed": console.print(f"[yellow]🔄 Updating changed document '{metadata['filename']}'[/yellow]") @@ -318,12 +322,13 @@ def rag_index_multiple_files( }, vector=vector ) - - console.print(f"[bold green]✅ Indexed {len(chunks)} chunks in {index_name}") + + documents_indexed.append(metadata["document_id"]) + console.print(f"[bold green]✅ Indexed {len(chunks)} chunks in {index_name}") finally: if client and client_created: client.close() - return "indexed" + return documents_indexed def delete_document_from_index( collection: Any, From 050a654a8b096f5ebf2b89f809d3801c4d382582 Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Thu, 7 Aug 2025 04:31:00 -0400 Subject: [PATCH 2/8] yaml fix --- .github/workflows/pr-summary-agent.yml | 2 +- pr_agent/PR_agent.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml index a1f85441..c7352429 100644 --- a/.github/workflows/pr-summary-agent.yml +++ b/.github/workflows/pr-summary-agent.yml @@ -40,7 +40,7 @@ jobs: REPO_OWNER: supercog-ai REPO_NAME: PR_code_review-agent - -name: Update weaviate cache + - name: Update weaviate cache uses: actions/upload-artifact@v4 with: name: weaviate diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py index 68bcad58..1598e69b 100644 --- a/pr_agent/PR_agent.py +++ b/pr_agent/PR_agent.py @@ -219,8 +219,8 @@ def next_turn( pr_review_agent = PRReviewAgent() if __name__ == "__main__": - # Change to PRChanges.patch for deployment - with open("PRChangesTest.patch", "r") as f: + # Change to PRChangesTest.patch for testing + with open("PRChanges.patch", "r") as f: patch_content = f.read() # Run the agent From e391397d8a3b153f86781861664b9c2ee8eb2e31 Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Thu, 7 Aug 2025 04:38:44 -0400 Subject: [PATCH 3/8] yaml fix --- .github/workflows/pr-summary-agent.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml index c7352429..f7df5029 100644 --- a/.github/workflows/pr-summary-agent.yml +++ b/.github/workflows/pr-summary-agent.yml @@ -23,7 +23,7 @@ jobs: uses: actions/download-artifact@v4 with: name: weaviate - path: ${{ HOME }}/.cache/weaviate + path: /home/.cache/weaviate - name: Run agent run: | @@ -44,4 +44,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: weaviate - path: ${{ HOME }}/.cache/weaviate + path: /home/.cache/weaviate From 9329b417130ed5e0895f95fbdc003ef710a89722 Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Thu, 7 Aug 2025 04:46:17 -0400 Subject: [PATCH 4/8] modified: .github/workflows/pr-summary-agent.yml --- .github/workflows/pr-summary-agent.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml index f7df5029..39721882 100644 --- a/.github/workflows/pr-summary-agent.yml +++ b/.github/workflows/pr-summary-agent.yml @@ -21,9 +21,6 @@ jobs: - name: Download a single artifact uses: actions/download-artifact@v4 - with: - name: weaviate - path: /home/.cache/weaviate - name: Run agent run: | From 17286d2bb574c555fe3da4fa4ce26830521e5814 Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Thu, 7 Aug 2025 04:50:11 -0400 Subject: [PATCH 5/8] fix --- pr_agent/PR_agent.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py index c3fd1cc1..3fdc7a31 100644 --- a/pr_agent/PR_agent.py +++ b/pr_agent/PR_agent.py @@ -8,6 +8,9 @@ from git_grep_agent import GitGrepAgent from summary_agent import SummaryAgent from pydantic import BaseModel +from typing import Dict, List, Any, Generator, Optional, Tuple +from agentic.common import Agent, AgentRunner, ThreadContext +from agentic.events import Event, ChatOutput, TurnEnd, PromptStarted, Prompt load_dotenv() From efe387cfd87511f142799ee71fae554c2fa9dead Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Thu, 7 Aug 2025 04:54:46 -0400 Subject: [PATCH 6/8] Revert "yaml fix" This reverts commit e391397d8a3b153f86781861664b9c2ee8eb2e31. --- .github/workflows/pr-summary-agent.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml index 39721882..c7f6ca36 100644 --- a/.github/workflows/pr-summary-agent.yml +++ b/.github/workflows/pr-summary-agent.yml @@ -41,4 +41,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: weaviate - path: /home/.cache/weaviate + path: ${{ HOME }}/.cache/weaviate From 0fa0ea2a9ae0dc957f4070baf4a74717767ed317 Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Thu, 7 Aug 2025 04:57:38 -0400 Subject: [PATCH 7/8] yaml --- .github/workflows/pr-summary-agent.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml index c7f6ca36..ac8cb038 100644 --- a/.github/workflows/pr-summary-agent.yml +++ b/.github/workflows/pr-summary-agent.yml @@ -41,4 +41,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: weaviate - path: ${{ HOME }}/.cache/weaviate + path: home/.cache/weaviate From 74b1eac05952953f476b955465bb001d54d37182 Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Thu, 7 Aug 2025 04:58:52 -0400 Subject: [PATCH 8/8] yaml --- pr_agent/PR_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py index 3fdc7a31..3867aeff 100644 --- a/pr_agent/PR_agent.py +++ b/pr_agent/PR_agent.py @@ -217,4 +217,4 @@ def next_turn( patch_content = f.read() # Run the agent - print(pr_review_agent.generate(patch_content)) \ No newline at end of file + print(pr_review_agent.final_result(patch_content)) \ No newline at end of file