11 changes: 10 additions & 1 deletion .github/workflows/pr-summary-agent.yml
@@ -18,7 +18,10 @@ jobs:

- name: Install uv
uses: astral-sh/setup-uv@v6


- name: Download a single artifact
uses: actions/download-artifact@v4

- name: Run agent
run: |
uv venv --python 3.12
@@ -33,3 +36,9 @@ jobs:
GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }}
REPO_OWNER: supercog-ai
REPO_NAME: PR_code_review-agent

- name: Update weaviate cache
uses: actions/upload-artifact@v4
with:
name: weaviate
path: home/.cache/weaviate
348 changes: 348 additions & 0 deletions PRChangesTest.patch
@@ -0,0 +1,348 @@
diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml
index 88e50da..1fe312b 100644
--- a/.github/workflows/pr-summary-agent.yml
+++ b/.github/workflows/pr-summary-agent.yml
@@ -25,10 +25,8 @@ jobs:
uv pip install -e "../PR_code_review-agent[all,dev]" --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-first-match
git diff --merge-base HEAD^1 HEAD > PRChanges.patch
cat PRChanges.patch
- uv run pr_agent/test_files/mock_pr_agent.py
-
-#uv run pr_agent/PR_agent.py
-
+ uv run pr_agent/PR_agent.py
+
env:
OPENAI_API_KEY: ${{ secrets.PRAgentOpenAIKey }}
PR_ID: ${{ github.event.pull_request.number }}
diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py
index 46b8f1f..c99d863 100644
--- a/pr_agent/PR_agent.py
+++ b/pr_agent/PR_agent.py
@@ -29,14 +29,6 @@ class SearchResult(BaseModel):
similarity_score: float = Field(
description="Similarity score returned from vector search."
)
- is_relevant: bool = Field(
- default = True,
- description="Boolean describing if the search result is relevant to the query."
- )
- relevance_reason: str = Field(
- default = "",
- description="Boolean describing if the search result is relevant to the query."
- )
included_defs: List[str] = Field(
default_factory=list,
description="Similarity score returned from vector search."
@@ -49,7 +41,6 @@ class Searches(BaseModel):

class RelevanceResult(BaseModel):
relevant: bool
- reason: str

class PRReviewAgent(Agent):

@@ -81,7 +72,7 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a

self.relevanceAgent = Agent(
name="Code Relevange Agent",
- instructions="""You are an expert in determining if a snippet of code or documentation is needed to determine the purpose of a code change from the patch file. Your response must include a 'relevant' field boolean and a 'reason' field with a brief explanation.""",
+ instructions="""You are an expert in determining if a snippet of code or documentation is directly relevant to a query. Your response must include a 'relevant' field boolean.""",
model=GPT_4O_MINI,
result_model=RelevanceResult,
)
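
The RelevanceResult model above is trimmed to a single boolean, and the relevance agent is asked to return just that field as structured output. A minimal sketch of parsing such a response with pydantic (assuming pydantic v2; the JSON payload is illustrative):

# Minimal structured-output model; the JSON string below is illustrative.
from pydantic import BaseModel

class RelevanceResult(BaseModel):
    relevant: bool

raw = '{"relevant": true}'   # e.g. what the relevance agent is instructed to emit
result = RelevanceResult.model_validate_json(raw)
print(result.relevant)       # True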
@@ -142,10 +133,10 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a
}
)

- print("quer"+str(queries))
+ print("queries: "+str(queries))

- all_results = []
-
+ # RAG queries
+ all_results = {}
for query in queries.searches[:10]:
searchResponse = yield from self.code_rag_agent.final_result(
f"Search codebase",
@@ -156,35 +147,34 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a
)

# Process each result
- for result in searchResponse.sections:
- all_results.append(SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs))
+ for key, result in searchResponse.sections.items():
+ if not key in all_results:
+ all_results[key] = SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs)

- print("fil"+str(all_results))
+ print("all: "+str(all_results))

# Filter search results using LLM-based relevance checking
filtered_results = []
-
- for result in all_results:
- if result.similarity_score < 0.5:
- continue
-
- relevance_check = yield from self.relevanceAgent.final_result(
- f"<Patch File>\n{request_context.get("patch_content")}\n</Patch File>\n\n<Content>{result.content}</Content><Query>{result.query}</Query>"
- )
-
- print(relevance_check)
-
- result.is_relevant = relevance_check.relevant
- result.relevance_reason = relevance_check.reason
+ for result in all_results.values():

- if result.is_relevant:
- filtered_results.append(result)
+ try:
+ relevance_check = yield from self.relevanceAgent.final_result(
+ f"<Patch File>\n{request_context.get("patch_content")}\n</Patch File>\n\n<Content>{result.content}</Content><Query>{result.query}</Query>"
+ )
+
+ if relevance_check.relevant:
+ filtered_results.append(result)
+ except Exception as e:
+ # LLM error
+ print(e)

- print(str(filtered_results))
+ print("filtered: ",str(filtered_results))

# Prepare for summary
formatted_str = self.prepare_summary(request_context.get("patch_content"),filtered_results)

+ print(formatted_str)
+
summary = yield from self.summaryAgent.final_result(
formatted_str
)
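
The PR_agent.py changes above collect search hits into a dict keyed by file so duplicate files are collapsed, drop the per-result relevance bookkeeping, and wrap each relevance call in try/except so one failed LLM call no longer aborts the filtering pass. A minimal sketch of that pattern, with check_relevance() as a hypothetical stand-in for relevanceAgent.final_result():

# Dedupe hits by file path, then keep only the ones an LLM judges relevant.
from dataclasses import dataclass

@dataclass
class Hit:
    query: str
    file_path: str
    content: str

def check_relevance(patch: str, hit: Hit) -> bool:
    # Hypothetical LLM call; assume it can raise on transient errors.
    raise NotImplementedError

def filter_hits(patch: str, hits: list[Hit]) -> list[Hit]:
    unique: dict[str, Hit] = {}
    for hit in hits:
        if hit.file_path not in unique:      # first hit per file wins
            unique[hit.file_path] = hit

    kept: list[Hit] = []
    for hit in unique.values():
        try:
            if check_relevance(patch, hit):
                kept.append(hit)
        except Exception as exc:
            # A single failed check should not abort the whole pass.
            print(f"relevance check failed for {hit.file_path}: {exc}")
    return kept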
diff --git a/pr_agent/code_rag_agent.py b/pr_agent/code_rag_agent.py
index b7f36df..32d0a34 100644
--- a/pr_agent/code_rag_agent.py
+++ b/pr_agent/code_rag_agent.py
@@ -21,7 +21,7 @@ class CodeSection(BaseModel):
)

class CodeSections(BaseModel):
- sections: List[CodeSection] = Field(
+ sections: dict[str,CodeSection] = Field(
description="Sections of the codebase returned from the search.",
)
search_query: str = Field(
@@ -46,7 +46,8 @@ class CodeRagAgent(Agent):

self.ragTool = RAGTool(
default_index="codebase",
- index_paths=["../*.md","../*.py"],
+ index_paths=["../**/*.py","../**/*.md"],
+ recursive=True
)


@@ -65,24 +66,28 @@ class CodeRagAgent(Agent):
searchQuery = request_context.get("query")

searchResult = self.ragTool.search_knowledge_index(query=searchQuery,limit=5)
-
- allSections = CodeSections(sections=[],search_query=query)
+
+ allSections = CodeSections(sections={},search_query=query)

for nextResult in searchResult:
- print(nextResult)
file_path = nextResult["source_url"]
- similarity_score = nextResult["distance"] if nextResult["distance"] else 0
- content = nextResult["content"]
+ if not file_path in allSections.sections:
+ #print(nextResult)
+
+ similarity_score = nextResult["distance"] if nextResult["distance"] else 0
+ content = nextResult["content"]

- # Only works with Python files
- included_defs = []
- try:
- with open(file_path) as file:
- node = ast.parse(file.read())
- included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)]
- except:
+ # Only works with Python files
included_defs = []
+ try:
+ with open(file_path) as file:
+ node = ast.parse(file.read())
+ included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)]
+ except:
+ included_defs = []

- allSections.sections.append(CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score))
+ allSections.sections[file_path] = CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score)
+ #else:
+ #print("Skipping Duplicate: ",file_path)

yield TurnEnd(self.name, [{"content": allSections}])
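
code_rag_agent.py now keys sections by file_path so repeated chunks from the same file are skipped, and for Python sources it lists the top-level class and function names with the standard ast module. A self-contained sketch of that extraction (the .py guard and the example call are assumptions):

# Collect top-level class/function names from a Python source file.
import ast

def top_level_defs(file_path: str) -> list[str]:
    if not file_path.endswith(".py"):        # only meaningful for Python files
        return []
    try:
        with open(file_path) as fh:
            tree = ast.parse(fh.read())
    except (OSError, SyntaxError):
        return []                            # unreadable or unparsable files yield nothing
    return [
        node.name
        for node in tree.body
        if isinstance(node, (ast.ClassDef, ast.FunctionDef))
    ]

# e.g. top_level_defs("pr_agent/code_rag_agent.py") -> ["CodeSection", "CodeSections", "CodeRagAgent"]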
diff --git a/src/agentic/events.py b/src/agentic/events.py
index 01aa68e..da12979 100644
--- a/src/agentic/events.py
+++ b/src/agentic/events.py
@@ -656,7 +656,6 @@ class TurnEnd(Event):
def result(self):
"""Safe result access with fallback"""
try:
- print(self.agent,self.messages)
return self.messages[-1]["content"] if self.messages else "No response generated"
except (IndexError, KeyError):
return "Error: Malformed response"
diff --git a/src/agentic/tools/rag_tool.py b/src/agentic/tools/rag_tool.py
index e3e7280..9891e7c 100644
--- a/src/agentic/tools/rag_tool.py
+++ b/src/agentic/tools/rag_tool.py
@@ -14,6 +14,7 @@ from agentic.utils.rag_helper import (
init_embedding_model,
init_chunker,
rag_index_file,
+ rag_index_multiple_files,
)

from agentic.utils.summarizer import generate_document_summary
@@ -44,6 +45,8 @@ class RAGTool(BaseAgenticTool):
# Construct the RAG tool. You can pass a list of files and we will ensure that
# they are added to the index on startup. Paths can include glob patterns also,
# like './docs/*.md'.
+        # Enable recursive ('**/*.md') glob patterns with recursive=True
+
self.default_index = default_index
self.index_paths = index_paths
if self.index_paths:
@@ -51,8 +54,11 @@ class RAGTool(BaseAgenticTool):
if default_index not in list_collections(client):
create_collection(client, default_index, VectorDistances.COSINE)
for path in index_paths:
- for file_path in [path] if path.startswith("http") else glob.glob(path, recursive=recursive):
- rag_index_file(file_path, self.default_index, client=client, ignore_errors=True)
+ if path.startswith("http"):
+ rag_index_file(path, self.default_index, client=client, ignore_errors=True)
+ else:
+ file_paths = glob.glob(path, recursive=recursive)
+ rag_index_multiple_files(file_paths, self.default_index, client=client, ignore_errors=True)

def get_tools(self) -> List[Callable]:
return [
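
rag_tool.py now separates http(s) URLs, which still go through rag_index_file one at a time, from local glob patterns, which are expanded with glob.glob(..., recursive=True) and handed to the new rag_index_multiple_files in one batch. A rough sketch of that dispatch, with index_one()/index_many() as hypothetical stand-ins for the two helpers:

# Split remote URLs from local glob patterns before indexing.
import glob

def index_one(path: str) -> None: ...          # hypothetical: rag_index_file
def index_many(paths: list[str]) -> None: ...  # hypothetical: rag_index_multiple_files

def index_paths(paths: list[str], recursive: bool = True) -> None:
    for path in paths:
        if path.startswith("http"):
            index_one(path)                    # remote documents are fetched individually
        else:
            matches = glob.glob(path, recursive=recursive)  # '**' only recurses with recursive=True
            index_many(matches)                # local files are indexed as a single batch

# e.g. index_paths(["../**/*.py", "../**/*.md"])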
diff --git a/src/agentic/utils/file_reader.py b/src/agentic/utils/file_reader.py
index 3ea5ae2..70671fb 100644
--- a/src/agentic/utils/file_reader.py
+++ b/src/agentic/utils/file_reader.py
@@ -57,6 +57,9 @@ def read_file(file_path: str, mime_type: str|None = None) -> tuple[str, str]:
return text, mime_type
elif mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
return pd.read_excel(file_path).to_csv(), mime_type
+ elif mime_type == "text/x-python":
+ with open(file_path,"r") as f:
+ return f.read(), mime_type
else:
return textract.process(file_path).decode('utf-8'), mime_type
except Exception as e:
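
file_reader.py gains a branch for the text/x-python mime type so Python sources are read as plain text rather than handed to textract. A minimal sketch of that kind of mime-based dispatch, using the standard mimetypes module to guess the type (the fallback is left unimplemented here):

# Read Python sources as plain text based on their guessed mime type.
import mimetypes

def read_source(file_path: str) -> tuple[str, str]:
    mime_type, _ = mimetypes.guess_type(file_path)   # ".py" guesses as "text/x-python"
    if mime_type == "text/x-python":
        with open(file_path, "r") as fh:
            return fh.read(), mime_type
    # Assumption: anything else would fall back to a generic extractor.
    raise NotImplementedError(f"no plain-text reader for {mime_type}")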
diff --git a/src/agentic/utils/rag_helper.py b/src/agentic/utils/rag_helper.py
index 94dd7f5..1d585f2 100644
--- a/src/agentic/utils/rag_helper.py
+++ b/src/agentic/utils/rag_helper.py
@@ -234,7 +234,96 @@ def rag_index_file(
if client and client_created:
client.close()
return "indexed"
+
+def rag_index_multiple_files(
+ file_paths: List[str],
+ index_name: str,
+ chunk_threshold: float = 0.5,
+ chunk_delimiters: str = ". ,! ,? ,\n",
+ embedding_model: str = "BAAI/bge-small-en-v1.5",
+ client: WeaviateClient|None = None,
+ ignore_errors: bool = False,
+ distance_metric: VectorDistances = VectorDistances.COSINE,
+):
+ """Index a file using configurable Weaviate Embedded and chunking parameters"""
+
+ console = Console()
+ client_created = False
+ try:
+ with Status("[bold green]Initializing Weaviate..."):
+ if client is None:
+ client = init_weaviate()
+ client_created = True
+ create_collection(client, index_name, distance_metric)
+
+ with Status("[bold green]Initializing models..."):
+ embed_model = init_embedding_model(embedding_model)
+ chunker = init_chunker(chunk_threshold, chunk_delimiters)

+ for file_path in file_paths:
+ with Status(f"[bold green]Processing {file_path}...", console=console):
+ text, mime_type = read_file(str(file_path))
+ metadata = prepare_document_metadata(file_path, text, mime_type, GPT_DEFAULT_MODEL)
+
+ console.print(f"[bold green]Indexing {file_path}...")
+
+ collection = client.collections.get(index_name)
+ exists, status = check_document_exists(
+ collection,
+ metadata["document_id"],
+ metadata["fingerprint"]
+ )
+
+ if status == "unchanged":
+ console.print(f"[yellow]⏩ Document '{metadata['filename']}' unchanged[/yellow]")
+ continue
+ elif status == "duplicate":
+ console.print(f"[yellow]⚠️ Content already exists under different filename[/yellow]")
+ continue
+ elif status == "changed":
+ console.print(f"[yellow]🔄 Updating changed document '{metadata['filename']}'[/yellow]")
+ collection.data.delete_many(
+ where=Filter.by_property("document_id").equal(metadata["document_id"])
+ )
+
+ with Status("[bold green]Generating document summary...", console=console):
+ metadata["summary"] = generate_document_summary(
+ text=text[:12000],
+ mime_type=mime_type,
+ model=GPT_DEFAULT_MODEL
+ )
+
+ chunks = chunker(text)
+ chunks_text = [chunk.text for chunk in chunks]
+ if not chunks_text:
+ if ignore_errors:
+ return client
+ raise ValueError("No text chunks generated from document")
+
+ batch_size = 128
+ embeddings = []
+ with Status("[bold green]Generating embeddings..."):
+ for i in range(0, len(chunks_text), batch_size):
+ batch = chunks_text[i:i+batch_size]
+ embeddings.extend(list(embed_model.embed(batch)))
+
+ with Status("[bold green]Indexing chunks..."), collection.batch.dynamic() as batch:
+ for i, chunk in enumerate(chunks):
+ vector = embeddings[i].tolist()
+ batch.add_object(
+ properties={
+ **metadata,
+ "content": chunk.text,
+ "chunk_index": i,
+ },
+ vector=vector
+ )
+
+ console.print(f"[bold green]✅ Indexed {len(chunks)} chunks in {index_name}")
+ finally:
+ if client and client_created:
+ client.close()
+ return "indexed"

def delete_document_from_index(
collection: Any,
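
rag_index_multiple_files embeds chunk text in slices of 128 before writing them to the collection, which keeps each embedding call bounded regardless of how many chunks a document produces. A stripped-down sketch of that batching loop, with embed_batch() as a hypothetical stand-in for embed_model.embed():

# Embed chunk texts in fixed-size batches to bound each model call.
def embed_batch(texts: list[str]) -> list[list[float]]: ...   # hypothetical embedding call

def embed_in_batches(chunks_text: list[str], batch_size: int = 128) -> list[list[float]]:
    embeddings: list[list[float]] = []
    for i in range(0, len(chunks_text), batch_size):
        batch = chunks_text[i:i + batch_size]
        embeddings.extend(embed_batch(batch))    # one embedding call per batch
    return embeddings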