From 7cd2f14e146ac34e939671703e3304bcd0b468f2 Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Tue, 12 Aug 2025 18:35:30 -0400 Subject: [PATCH 1/2] test --- test | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test diff --git a/test b/test new file mode 100644 index 00000000..e69de29b From 7974fb96486f0ce1151b049bc140fcc2ab589259 Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Thu, 14 Aug 2025 17:40:52 -0400 Subject: [PATCH 2/2] Patch test now includes all changes from the PR agent fork --- PRChangesTest.patch | 902 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 720 insertions(+), 182 deletions(-) diff --git a/PRChangesTest.patch b/PRChangesTest.patch index 55eb5bd0..beaf620e 100644 --- a/PRChangesTest.patch +++ b/PRChangesTest.patch @@ -1,214 +1,670 @@ diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml -index 88e50da..1fe312b 100644 ---- a/.github/workflows/pr-summary-agent.yml +new file mode 100644 +index 0000000..dcb5428 +--- /dev/null +++ b/.github/workflows/pr-summary-agent.yml -@@ -25,10 +25,8 @@ jobs: - uv pip install -e "../PR_code_review-agent[all,dev]" --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-first-match - git diff --merge-base HEAD^1 HEAD > PRChanges.patch - cat PRChanges.patch -- uv run pr_agent/test_files/mock_pr_agent.py -- --#uv run pr_agent/PR_agent.py -- +@@ -0,0 +1,52 @@ ++name: PR Summary Agent ++ ++on: ++ pull_request: ++ types: [opened, synchronize, reopened] ++ branches: ++ - main ++jobs: ++ run-pr-agent: ++ permissions: write-all ++ name: run PR Summary Agent ++ runs-on: ubuntu-latest ++ steps: ++ - name: Check out the repository to the runner ++ uses: actions/checkout@v4 ++ with: ++ fetch-depth: 2 ++ ++ - name: Install uv ++ uses: astral-sh/setup-uv@v6 ++ ++ - name: Download weaviate cache ++ id: download-artifact ++ uses: dawidd6/action-download-artifact@v11 ++ with: ++ github_token: ${{secrets.GITHUB_TOKEN}} ++ workflow: pr-summary-agent.yml ++ name: weaviate ++ if_no_artifact_found: warn ++ path: /home/runner/.cache/weaviate ++ ++ - name: Run agent ++ run: | ++ uv venv --python 3.12 ++ uv pip install -e "../${{ github.event.repository.name }}[all,dev]" --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-first-match ++ uv pip install litellm[proxy] ++ git diff --merge-base HEAD^1 HEAD > PRChanges.patch ++ cat PRChanges.patch + uv run pr_agent/PR_agent.py + - env: - OPENAI_API_KEY: ${{ secrets.PRAgentOpenAIKey }} - PR_ID: ${{ github.event.pull_request.number }} ++ env: ++ OPENAI_API_KEY: ${{ secrets.PRAgentOpenAIKey }} ++ PR_ID: ${{ github.event.pull_request.number }} ++ GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} ++ REPO: ${{ github.repository }} ++ ++ - name: Update weaviate cache ++ uses: actions/upload-artifact@v4 ++ with: ++ name: weaviate ++ path: /home/runner/.cache/weaviate ++ overwrite: true diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py -index 46b8f1f..c99d863 100644 ---- a/pr_agent/PR_agent.py +new file mode 100644 +index 0000000..5d5abd0 +--- /dev/null +++ b/pr_agent/PR_agent.py -@@ -29,14 +29,6 @@ class SearchResult(BaseModel): - similarity_score: float = Field( - desciption="Similarity score returned from vector search." - ) -- is_relevant: bool = Field( -- default = True, -- description="Boolean describing if the search result is relevant to the query." -- ) -- relevance_reason: str = Field( -- default = "", -- description="Boolean describing if the search result is relevant to the query." -- ) - included_defs: List[str] = Field( - default_factory=list, - desciption="Similarity score returned from vector search." -@@ -49,7 +41,6 @@ class Searches(BaseModel): - - class RelevanceResult(BaseModel): - relevant: bool -- reason: str - - class PRReviewAgent(Agent): - -@@ -81,7 +72,7 @@ You t in determining if a snippet of code or documentation is needed to determine the purpose of a code change from the patch file. Your response must include a 'relevant' field boolean and a 'reason' field with a brief explanation.""", -+ instructions="""You are an expertare an expert in generating NON-NATURAL LANGUAGE CODE search queries from a - - self.relevanceAgent = Agent( - name="Code Relevange Agent", -- instructions="""You are an exper in determining if a snippet of code or documentation is directly relevant to a query. Your response must include a 'relevant' field boolean.""", - model=GPT_4O_MINI, - result_model=RelevanceResult, - ) -@@ -142,10 +133,10 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a - } - ) - -- print("quer"+str(queries)) -+ print("queries: "+str(queries)) - -- all_results = [] -- -+ # RAG queries +@@ -0,0 +1,218 @@ ++import os ++from pathlib import Path ++import re ++import json ++import requests ++from typing import Dict, List, Any, Generator, Optional, Tuple ++from pydantic import Field, BaseModel ++from dotenv import load_dotenv ++from agentic.common import Agent, AgentRunner, ThreadContext ++from agentic.events import Event, ChatOutput, TurnEnd, PromptStarted, Prompt ++from agentic.models import GPT_4O_MINI ++from litellm import token_counter ++from code_rag_agent import CodeRagAgent ++from git_grep_agent import GitGrepAgent ++from summary_agent import SummaryAgent ++from code_rag_agent import CodeSection, CodeSections ++from pydantic import BaseModel ++ ++SUMMARY_MODEL = GPT_4O_MINI ++# SUMMARY_MODEL = CLAUDE ++ ++load_dotenv() ++ ++class SearchResult(BaseModel): ++ query: str = Field( ++ description="Query used in this search." ++ ) ++ file_path: str = Field( ++ description="Path of the file this code/documentation belongs to." ++ ) ++ content: str = Field( ++ description="Content returned from search." ++ ) ++ similarity_score: float = Field( ++ desciption="Similarity score returned from vector search." ++ ) ++ included_defs: List[str] = Field( ++ default_factory=list, ++ desciption="Similarity score returned from vector search." ++ ) ++ ++class Searches(BaseModel): ++ searches: List[str] = Field( ++ description="Search queries." ++ ) ++ ++class RelevanceResult(BaseModel): ++ relevant: bool ++ ++class PRReviewAgent(Agent): ++ ++ def __init__( ++ self, ++ name: str = "PR Review Agent", ++ model: str = GPT_4O_MINI, ++ verbose: bool = False, ++ **kwargs ++ ): ++ super().__init__( ++ name=name, ++ welcome="PR Review Agent initialized. Ready to process PRs.", ++ model=model, ++ **kwargs ++ ) ++ self.git_grep_agent = GitGrepAgent() ++ self.code_rag_agent = CodeRagAgent() ++ self.verbose = verbose ++ ++ self.queryAgent = Agent( ++ name="Code Query Agent", ++ instructions= ++""" ++You are an expert in generating code search queries from a patch file to get additional context about changes to a code base. Your response must include a 'searches' field with a list of strings. Example outputs: ["Weather_Tool", "SearchQuery", "format_sections"] ++""", ++ model=GPT_4O_MINI, ++ result_model=Searches, ++ ) ++ ++ self.summaryAgent = SummaryAgent() ++ ++ def prepare_summary(self, patch_content: str, filtered_results: Dict[str,SearchResult]) -> str: ++ ++ """Prepare for summary agent""" ++ formatted_str = "" ++ formatted_str += f"\n" ++ formatted_str += f"{patch_content}\n" ++ formatted_str += f"\n\n" ++ ++ final_str = formatted_str[:] ++ ++ for result in filtered_results.values(): ++ formatted_str += f"<{result.file_path}>\n" ++ formatted_str += f"{result.content}\n" ++ formatted_str += f"\n\n" ++ ++ if token_counter(model=SUMMARY_MODEL, messages=[{"role": "user", "content": {final_str}}]) > 115000: ++ break ++ else: ++ final_str = formatted_str[:] ++ ++ return final_str ++ ++ def post_to_github(self, summary: str) -> str: ++ """Post summary as a GitHub comment""" ++ repo = os.getenv("REPO") ++ pr_id = os.getenv("PR_ID") ++ gh_token = os.getenv("GITHUB_API_KEY") ++ ++ if not all([repo, pr_id, gh_token]): ++ raise ValueError("Missing required GitHub configuration") ++ ++ url = f"https://api.github.com/repos/{repo}/issues/{pr_id}/comments" ++ headers = { ++ "Authorization": f"token {gh_token}", ++ } ++ data = {"body": summary} ++ ++ response = requests.post(url, headers=headers, json=data) ++ response.raise_for_status() ++ return response.json().get("html_url") ++ ++ def next_turn( ++ self, ++ request: str, ++ request_context: dict = None, ++ request_id: str = None, ++ continue_result: dict = {}, ++ debug = "", ++ ) -> Generator[Event, Any, None]: ++ ++ query = request.payload if isinstance(request, Prompt) else request ++ yield PromptStarted(query, {"query": query}) ++ ++ # Generate search queries ++ queries = yield from self.queryAgent.final_result( ++ request_context.get("patch_content"), ++ request_context={ ++ "thread_id": request_context.get("thread_id") ++ } ++ ) ++ ++ print("queries: ", str(queries)) ++ ++ # RAG and Git-Grep queries + all_results = {} - for query in queries.searches[:10]: - searchResponse = yield from self.code_rag_agent.final_result( - f"Search codebase", -@@ -156,35 +147,34 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a - ) - - # Process each result -- for result in searchResponse.sections: -- all_results.append(SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs)) -+ for key, result in searchResponse.sections.items(): -+ if not key in all_results: -+ all_results[key] = SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs) - -- print("fil"+str(all_results)) -+ print("all: "+str(all_results)) - - # Filter search results using LLM-based relevance checking - filtered_results = [] -- -- for result in all_results: -- if result.similarity_score < 0.5: -- continue -- -- relevance_check = yield from self.relevanceAgent.final_result( -- f"\n{request_context.get("patch_content")}\n\n\n{result.content}{result.query}" -- ) -- -- print(relevance_check) -- -- result.is_relevant = relevance_check.relevant -- result.relevance_reason = relevance_check.reason -+ for result in all_results.values(): - -- if result.is_relevant: -- filtered_results.append(result) -+ try: -+ relevance_check = yield from self.relevanceAgent.final_result( -+ f"\n{request_context.get("patch_content")}\n\n\n{result.content}{result.query}" -+ ) -+ -+ if relevance_check.relevant: -+ filtered_results.append(result) -+ except Exception as e: -+ # LLM error -+ print(e) - -- print(str(filtered_results)) -+ print("filtered: ",str(filtered_results)) - - # Prepare for summary - formatted_str = self.prepare_summary(request_context.get("patch_content"),filtered_results) - ++ for query in queries.searches[:10]: ++ searchResponse = yield from self.code_rag_agent.final_result( ++ "Search codebase", ++ request_context={ ++ "query": query, ++ "thread_id": request_context.get("thread_id") ++ } ++ ) ++ ++ # Process each result ++ for file, result in searchResponse.sections.items(): ++ if file not in all_results: ++ all_results[file] = SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs) ++ ++ searchResponse = yield from self.git_grep_agent.final_result( ++ "Search codebase with git grep", ++ request_context={ ++ "query": query, ++ "thread_id": request_context.get("thread_id") ++ } ++ ) ++ ++ # Process each result ++ # grep_response.sections is a dict of filepaths and CodeSection objects ++ for file, result in searchResponse.sections.items(): ++ if file not in all_results: ++ all_results[file] = SearchResult( ++ query=query, ++ file_path=result.file_path, ++ content=result.search_result, ++ similarity_score=result.similarity_score, ++ included_defs=result.included_defs ++ ) ++ ++ ++ print("all: ", all_results) ++ ++ # Prepare for summary ++ formatted_str = self.prepare_summary(request_context.get("patch_content"),all_results) ++ + print(formatted_str) + - summary = yield from self.summaryAgent.final_result( - formatted_str - ) ++ summary = yield from self.summaryAgent.final_result( ++ formatted_str ++ ) ++ ++ comment_url = self.post_to_github(summary) ++ ++ # Return the final result ++ yield ChatOutput( ++ self.name, ++ [{"content": f"## PR Review Complete\n\nSummary posted to: {comment_url}"}] ++ ) ++ ++ yield TurnEnd( ++ self.name, ++ [{"content": summary}] ++ ) ++ ++ ++ ++if __name__ == "__main__": ++ ++ # test ++ # Change to PRChangesTest.patch for testing ++ with open("PRChangesTest.patch", "r") as f: ++ patch_content = f.read() ++ ++ # Create an instance of the agent ++ pr_review_agent = PRReviewAgent() ++ ++ # Run the agent ++ print(pr_review_agent.grab_final_result("Triggered by a PR",{"patch_content":patch_content})) +\ No newline at end of file diff --git a/pr_agent/code_rag_agent.py b/pr_agent/code_rag_agent.py -index b7f36df..32d0a34 100644 ---- a/pr_agent/code_rag_agent.py +new file mode 100644 +index 0000000..474550e +--- /dev/null +++ b/pr_agent/code_rag_agent.py -@@ -21,7 +21,7 @@ class CodeSection(BaseModel): - ) - - class CodeSections(BaseModel): -- sections: List[CodeSection] = Field( +@@ -0,0 +1,93 @@ ++from typing import Any, Generator, List ++from agentic.common import Agent, AgentRunner, ThreadContext ++from agentic.events import Event, ChatOutput, WaitForInput, Prompt, PromptStarted, TurnEnd, ResumeWithInput ++from agentic.models import GPT_4O_MINI # model (using GPT for testing) ++from pydantic import BaseModel, Field ++from agentic.tools.rag_tool import RAGTool ++import ast ++ ++class CodeSection(BaseModel): ++ search_result: str = Field( ++ description="Part returned from search.", ++ ) ++ file_path: str = Field( ++ description="Path of the file this code belongs to." ++ ) ++ included_defs: list[str] = Field( ++ description="Classes and functions defined in this file." ++ ) ++ similarity_score: float = Field( ++ desciption="Similarity score returned from vector search." ++ ) ++ ++class CodeSections(BaseModel): + sections: dict[str,CodeSection] = Field( - description="Sections of the codebase returned from the search.", - ) - search_query: str = Field( -@@ -46,7 +46,8 @@ class CodeRagAgent(Agent): - - self.ragTool = RAGTool( - default_index="codebase", -- index_paths=["../*.md","../*.py"], -+ index_paths=["../**/*.py","../**/*.md"], ++ description="Sections of the codebase returned from the search.", ++ ) ++ search_query: str = Field( ++ description="Query used to return this section.", ++ ) ++ ++class CodeRagAgent(Agent): ++ def __init__(self, ++ name="Code Rag Agent", ++ welcome="I am the Code Rag Agent. Please give me a search query (function name,class name, etc.) and I'll return relevant parts of the code.", ++ model: str=GPT_4O_MINI, ++ result_model = CodeSections, ++ **kwargs ++ ): ++ super().__init__( ++ name=name, ++ welcome=welcome, ++ model=model, ++ result_model=result_model, ++ **kwargs ++ ) ++ ++ self.ragTool = RAGTool( ++ default_index="codebase", ++ index_paths=["../**/*.md"], + recursive=True - ) - - -@@ -65,24 +66,28 @@ class CodeRagAgent(Agent): - searchQuery = request_context.get("query") - - searchResult = self.ragTool.search_knowledge_index(query=searchQuery,limit=5) -- -- allSections = CodeSections(sections=[],search_query=query) ++ ) ++ ++ ++ def next_turn( ++ self, ++ request: str|Prompt, ++ request_context: dict = {}, ++ request_id: str = None, ++ continue_result: dict = {}, ++ debug = "", ++ ) -> Generator[Event, Any, Any]: ++ ++ query = request.payload if isinstance(request, Prompt) else request ++ yield PromptStarted(query, {"query": query}) ++ ++ searchQuery = request_context.get("query") ++ ++ searchResult = self.ragTool.search_knowledge_index(query=searchQuery,limit=5) + + allSections = CodeSections(sections={},search_query=query) - - for nextResult in searchResult: -- print(nextResult) - file_path = nextResult["source_url"] -- similarity_score = nextResult["distance"] if nextResult["distance"] else 0 -- content = nextResult["content"] ++ ++ for nextResult in searchResult: ++ file_path = nextResult["source_url"] + if not file_path in allSections.sections: + #print(nextResult) + + similarity_score = nextResult["distance"] if nextResult["distance"] else 0 + content = nextResult["content"] - -- # Only works with Python files -- included_defs = [] -- try: -- with open(file_path) as file: -- node = ast.parse(file.read()) -- included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)] -- except: ++ + # Only works with Python files - included_defs = [] ++ included_defs = [] + try: + with open(file_path) as file: + node = ast.parse(file.read()) + included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)] + except: + included_defs = [] - -- allSections.sections.append(CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score)) ++ + allSections.sections[file_path] = CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score) + #else: + #print("Skipping Duplicate: ",file_path) - - yield TurnEnd(self.name, [{"content": allSections}]) -diff --git a/src/agentic/events.py b/src/agentic/events.py -index 01aa68e..da12979 100644 ---- a/src/agentic/events.py -+++ b/src/agentic/events.py -@@ -656,7 +656,6 @@ class TurnEnd(Event): - def result(self): - """Safe result access with fallback""" - try: -- print(self.agent,self.messages) - return self.messages[-1]["content"] if self.messages else "No response generated" - except (IndexError, KeyError): - return "Error: Malformed response" ++ ++ yield TurnEnd(self.name, [{"content": allSections}]) +diff --git a/pr_agent/git_grep_agent.py b/pr_agent/git_grep_agent.py +new file mode 100644 +index 0000000..3567f53 +--- /dev/null ++++ b/pr_agent/git_grep_agent.py +@@ -0,0 +1,141 @@ ++from typing import Any, Generator, List ++from agentic.common import Agent ++from agentic.events import Event, Prompt, PromptStarted, TurnEnd ++from agentic.models import GPT_4O_MINI # model (using GPT for testing) ++import subprocess ++import ast ++import keyword ++import logging ++ ++from code_rag_agent import CodeSection, CodeSections ++ ++def find_full_function(file_path: str, line_number: int) -> str: ++ """Finds the full function definition given a file path and line number. Expects a properly formatted file.""" ++ ++ SUPPORTED_EXTENSIONS = [".py"] ++ line_number -= 1 # line numbers start at 1, not 0, bad for native zero indexing ++ try: ++ with open(file_path) as file: ++ text = file.read() ++ except Exception as e: ++ return f"Error with file: {e}" ++ ++ if not any(file_path.endswith(ext) for ext in SUPPORTED_EXTENSIONS): ++ logging.warning("File is not a supported extension, returning full file") ++ return text ++ ++ if file_path.endswith(".py"): ++ lines = text.splitlines() ++ ++ if len(lines) < line_number: # move this out of the if statement later ++ logging.error("Line number is out of bounds, returning full file") ++ return text ++ ++ # this function is "good enough" -- if there is any function that is not defined by the keyword module called outside of a class or function, it will keep it ++ def is_a_zero(line: str) -> bool: ++ if not line or line[0] == " " or not keyword.iskeyword(line.split()[0]): ++ return False ++ return True ++ ++ # The idea is to find the full class definition the function is embedded in, so we need to go up and down until we find this ++ top_index = line_number ++ ++ while (top_index > 0 and not is_a_zero(lines[top_index])): ++ top_index -= 1 ++ ++ bottom_index = line_number + 1 ++ while (bottom_index < len(lines) and not is_a_zero(lines[bottom_index])): ++ bottom_index += 1 ++ ++ return "\n".join(lines[top_index:bottom_index]) ++ ++ return text ++ ++# The actual sub-agent that runs git grep and returns structured results ++class GitGrepAgent(Agent): ++ def __init__(self, ++ name="Git-Grep Agent", ++ welcome="I am the Git Grep Agent. Please give me a search query (function name,class name, etc.) and I'll return exact matches from the codebase.", ++ model: str=GPT_4O_MINI, ++ result_model = CodeSections, ++ **kwargs ++ ): ++ super().__init__( ++ name=name, ++ welcome=welcome, ++ model=model, ++ result_model=result_model, ++ **kwargs ++ ) ++ ++ ++ def run_git_grep(self, query: str) -> List[tuple[str, str]]: ++ # Runs "git grep -n " for the given query to find exact matches in the codebase ++ # parses each result line into (file_path, matched_line) both of which are strs ++ # and returns a list of (file_path, matched_line) tuples ++ try: ++ result = subprocess.run( ++ ["git", "grep", "-n", query], # make sure that query is getting passed by the Main Agent!!! ++ capture_output=True, ++ text=True, ++ check=False ++ ) ++ ++ ++ # example git grep output: "code_rag_agent.py:6:from agentic.tools.rag_tool import RAGTool" ++ ++ ++ # TODO: need to determine if the line number is neccessary returning... ++ matches = [] # list of matches from the git grep command --> will hold all (file_path, matched_line) tuples found! ++ for line in result.stdout.splitlines(): ++ if not line: ++ continue ++ parts = line.split(":", 2) # file_path, line_number, line_text ++ if len(parts) >= 3: # if the output line is in the correct format ++ file_path, line_number, matched_line = parts ++ matches.append((file_path, find_full_function(file_path, int(line_number)))) ++ return matches ++ except Exception as e: ++ print(f"Error running git grep: {e}") ++ return [] ++ ++ ++ ++ # the entry point for running one turn (input -> processing -> output) ++ def next_turn( ++ self, ++ request: str | Prompt, ++ request_context: dict = {}, ++ request_id: str = None, ++ continue_result: dict = {}, ++ debug = "", ++ ) -> Generator[Event, Any, Any]: ++ # same as for the code_rag_context ++ ++ ++ # Either use query from request_context or from direct input ++ query = request.payload if isinstance(request, Prompt) else request # extracts the query from the incoming request ++ yield PromptStarted(query, {"query": query}) # yields a PromptStarted event to signal the beginning of processing ++ ++ ++ search_query = request_context.get("query") # pulls the actual search query from the request context ++ grep_results = self.run_git_grep(search_query) # runs git grep for that specific query ++ ++ ++ # TODO: verify that sections doesn't have to be a dictionary instead (like code_rag_agent implementation) ++ allSections = CodeSections(sections={}, search_query=search_query) # creates an empty CodeSections object ++ ++ # loops over each grep match ++ for file_path, content in grep_results: ++ if not file_path in allSections.sections: ++ allSections.sections[file_path] = CodeSection( ++ search_result=content, ++ file_path=file_path, ++ included_defs=[], ++ similarity_score=1.0 # grep doesn't do semantic scoring ++ ) ++ ++ yield TurnEnd(self.name, [{"content": allSections}]) ++ ++if __name__ == "__main__": ++ print(find_full_function("git_grep_agent.py", 172)) +\ No newline at end of file +diff --git a/pr_agent/summary_agent.py b/pr_agent/summary_agent.py +new file mode 100644 +index 0000000..8401bbc +--- /dev/null ++++ b/pr_agent/summary_agent.py +@@ -0,0 +1,100 @@ ++from agentic.common import Agent ++from agentic.models import CLAUDE ++from agentic.models import GPT_4O_MINI # model (using GPT for testing) ++ ++class SummaryAgent(Agent): ++ def __init__(self, ++ name="PR Summary Agent", ++ ++ # Agent instructions ++ instructions="""You are a code review assistant. Your task is to analyze a GitHub pull request using the provided information and generate helpful, precise feedback. Your response must include specific insights only about the files and code that were changed in the pull request. ++ ++Intended Purpose ++Given a pull request's patch file and supporting repository files for context, generate a high-quality comment summarizing the changes and providing constructive feedback. Focus exclusively on the changes introduced by the pull request. ++ ++Input Format ++You will be given: ++ ++A patch file describing the code changes. ++ ++Supporting code files from the repository that provide context for understanding the codebase. ++ ++The format will be as follows: ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++... ++Only the contents within the tags represent the actual changes. The rest are context files and should only be used to understand the repository structure and functionality. Do not comment on context files unless they are included in the patch. ++ ++Output Format ++Respond with the following structured sections: ++ ++Key Features ++A list of important or high-level features introduced by the changes. ++ ++Summary of Changes ++A clear and concise explanation of what was changed in the pull request, written in plain language. ++ ++New Unlocks from Functionality ++Describe any new capabilities or usage scenarios unlocked by these changes. ++ ++Code Suggestions with Line Number References ++Provide specific suggestions for improving the changed code, referring to lines by number as seen in the patch. ++ ++Formatting Suggestions ++Note any formatting or stylistic improvements that should be made. ++ ++Additional Notes ++Include any other relevant insights, such as potential edge cases, compatibility issues, or tests that should be added. ++ ++Important Rules ++ ++Only refer to files/lines that are explicitly changed in the patch file. ++ ++Use the provided file contents only to gain context for understanding the changes. ++ ++Be constructive, concise, and clear in your feedback. ++ """, ++ model=GPT_4O_MINI, ++ #model=CLAUDE, # model ++ **kwargs ++ ): ++ super().__init__( ++ name=name, ++ instructions=instructions, ++ model=model, ++ **kwargs ++ ) ++ ++ ++# Main to use the agent on the test files ++if __name__ == "__main__": ++ context = "\n" ++ with open("PR_code_review-agent/pr_agent/test_files/test_patch_file.txt", "r") as file: ++ context += file.read() ++ context += "\n\n" ++ context += "\n" ++ with open("PR_code_review-agent/pr_agent/test_files/agent_runner_copy.txt", "r") as file: ++ context += file.read() ++ context += "\n\n" ++ context += "\n" ++ with open("PR_code_review-agent/pr_agent/test_files/agent_copy.txt", "r") as file: ++ context += file.read() ++ context += "\n\n" ++ context += "\n" ++ with open("PR_code_review-agent/pr_agent/test_files/weather_tool_copy.txt", "r") as file: ++ context += file.read() ++ context += "" ++ ++ agent = SummaryAgent() ++ print(agent << context) +\ No newline at end of file diff --git a/src/agentic/tools/rag_tool.py b/src/agentic/tools/rag_tool.py -index e3e7280..9891e7c 100644 +index 43d27d3..0c03848 100644 --- a/src/agentic/tools/rag_tool.py +++ b/src/agentic/tools/rag_tool.py -@@ -14,6 +14,7 @@ from agentic.utils.rag_helper import ( +@@ -14,6 +14,8 @@ from agentic.utils.rag_helper import ( init_embedding_model, init_chunker, rag_index_file, + rag_index_multiple_files, ++ delete_document_from_index, ) from agentic.utils.summarizer import generate_document_summary -@@ -44,6 +45,8 @@ class RAGTool(BaseAgenticTool): +@@ -22,6 +24,7 @@ from weaviate.classes.query import Filter, HybridFusion + from weaviate.collections.classes.grpc import Sort + from weaviate.classes.config import VectorDistances + ++from rich.console import Console + + @tool_registry.register( + name="RAGTool", +@@ -38,20 +41,59 @@ class RAGTool(BaseAgenticTool): + def __init__( + self, + default_index: str = "knowledge_base", +- index_paths: list[str] = [] ++ index_paths: list[str] = [], ++ recursive: bool = False, ++ overwrite_index = True, + ): # Construct the RAG tool. You can pass a list of files and we will ensure that # they are added to the index on startup. Paths can include glob patterns also, # like './docs/*.md'. @@ -217,20 +673,61 @@ index e3e7280..9891e7c 100644 self.default_index = default_index self.index_paths = index_paths if self.index_paths: -@@ -51,8 +54,11 @@ class RAGTool(BaseAgenticTool): + client = init_weaviate() if default_index not in list_collections(client): create_collection(client, default_index, VectorDistances.COSINE) ++ ++ # Keep track of files found during initialization ++ if overwrite_index: ++ indexed_documents = {} ++ for path in index_paths: -- for file_path in [path] if path.startswith("http") else glob.glob(path, recursive=recursive): +- for file_path in [path] if path.startswith("http") else glob.glob(path): - rag_index_file(file_path, self.default_index, client=client, ignore_errors=True) + if path.startswith("http"): -+ rag_index_file(path, self.default_index, client=client, ignore_errors=True) ++ document_id = rag_index_file(path, self.default_index, client=client, ignore_errors=True) ++ ++ if overwrite_index: ++ indexed_documents[document_id] = True ++ + else: + file_paths = glob.glob(path, recursive=recursive) -+ rag_index_multiple_files(file_paths, self.default_index, client=client, ignore_errors=True) ++ document_ids = rag_index_multiple_files(file_paths, self.default_index, client=client, ignore_errors=True) ++ ++ if overwrite_index: ++ for document_id in document_ids: ++ indexed_documents[document_id] = True ++ ++ # Delete indexed files not found during initialization ++ if overwrite_index: ++ try: ++ console = Console() ++ collection = client.collections.get(self.default_index) ++ documents = list_documents_in_collection(collection) ++ ++ for document in documents: ++ if not document["document_id"] in indexed_documents: ++ console.print(f"[bold green]✅ Removing deleted file {document['filename']} from index") ++ delete_document_from_index(collection=collection,document_id=document["document_id"],filename=document["filename"]) ++ ++ except Exception as e: ++ print(f"Error listing documents: {str(e)}") ++ return ++ finally: ++ if client: ++ client.close() def get_tools(self) -> List[Callable]: return [ +@@ -59,7 +101,7 @@ class RAGTool(BaseAgenticTool): + #self.list_indexes, + self.search_knowledge_index, + self.list_documents, +- self.review_full_document ++ self.review_full_document, + ] + + def save_content_to_knowledge_index( diff --git a/src/agentic/utils/file_reader.py b/src/agentic/utils/file_reader.py index 3ea5ae2..70671fb 100644 --- a/src/agentic/utils/file_reader.py @@ -246,13 +743,46 @@ index 3ea5ae2..70671fb 100644 return textract.process(file_path).decode('utf-8'), mime_type except Exception as e: diff --git a/src/agentic/utils/rag_helper.py b/src/agentic/utils/rag_helper.py -index 94dd7f5..1d585f2 100644 +index 94dd7f5..89a1f39 100644 --- a/src/agentic/utils/rag_helper.py +++ b/src/agentic/utils/rag_helper.py -@@ -234,7 +234,96 @@ def rag_index_file( +@@ -91,7 +91,7 @@ def prepare_document_metadata( + + # Generate document ID from filename + metadata["document_id"] = hashlib.sha256( +- metadata["filename"].encode() ++ str(Path(file_path)).encode() + ).hexdigest() + + return metadata +@@ -155,7 +155,7 @@ def rag_index_file( + client: WeaviateClient|None = None, + ignore_errors: bool = False, + distance_metric: VectorDistances = VectorDistances.COSINE, +-): ++) -> str: + """Index a file using configurable Weaviate Embedded and chunking parameters""" + + console = Console() +@@ -186,10 +186,10 @@ def rag_index_file( + + if status == "unchanged": + console.print(f"[yellow]⏩ Document '{metadata['filename']}' unchanged[/yellow]") +- return ++ return metadata["document_id"] + elif status == "duplicate": + console.print(f"[yellow]⚠️ Content already exists under different filename[/yellow]") +- return ++ return metadata["document_id"] + elif status == "changed": + console.print(f"[yellow]🔄 Updating changed document '{metadata['filename']}'[/yellow]") + collection.data.delete_many( +@@ -233,8 +233,102 @@ def rag_index_file( + finally: if client and client_created: client.close() - return "indexed" +- return "indexed" ++ return metadata["document_id"] + +def rag_index_multiple_files( + file_paths: List[str], @@ -263,11 +793,13 @@ index 94dd7f5..1d585f2 100644 + client: WeaviateClient|None = None, + ignore_errors: bool = False, + distance_metric: VectorDistances = VectorDistances.COSINE, -+): ++) -> List[str]: + """Index a file using configurable Weaviate Embedded and chunking parameters""" + + console = Console() + client_created = False ++ ++ documents_indexed = [] + try: + with Status("[bold green]Initializing Weaviate..."): + if client is None: @@ -295,9 +827,11 @@ index 94dd7f5..1d585f2 100644 + + if status == "unchanged": + console.print(f"[yellow]⏩ Document '{metadata['filename']}' unchanged[/yellow]") ++ documents_indexed.append(metadata["document_id"]) + continue + elif status == "duplicate": + console.print(f"[yellow]⚠️ Content already exists under different filename[/yellow]") ++ documents_indexed.append(metadata["document_id"]) + continue + elif status == "changed": + console.print(f"[yellow]🔄 Updating changed document '{metadata['filename']}'[/yellow]") @@ -337,12 +871,16 @@ index 94dd7f5..1d585f2 100644 + }, + vector=vector + ) -+ -+ console.print(f"[bold green]✅ Indexed {len(chunks)} chunks in {index_name}") ++ ++ documents_indexed.append(metadata["document_id"]) ++ console.print(f"[bold green]✅ Indexed {len(chunks)} chunks in {index_name}") + finally: + if client and client_created: + client.close() -+ return "indexed" ++ return documents_indexed def delete_document_from_index( collection: Any, +diff --git a/test b/test +new file mode 100644 +index 0000000..e69de29