11 changes: 10 additions & 1 deletion .github/workflows/pr-summary-agent.yml
@@ -18,7 +18,10 @@ jobs:

- name: Install uv
uses: astral-sh/setup-uv@v6


- name: Download a single artifact
uses: actions/download-artifact@v4

- name: Run agent
run: |
uv venv --python 3.12
@@ -33,3 +36,9 @@ jobs:
GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }}
REPO_OWNER: supercog-ai
REPO_NAME: PR_code_review-agent

- name: Update weaviate cache
uses: actions/upload-artifact@v4
with:
name: weaviate
path: home/.cache/weaviate
348 changes: 348 additions & 0 deletions PRChangesTest.patch
@@ -0,0 +1,348 @@
diff --git a/.github/workflows/pr-summary-agent.yml b/.github/workflows/pr-summary-agent.yml
index 88e50da..1fe312b 100644
--- a/.github/workflows/pr-summary-agent.yml
+++ b/.github/workflows/pr-summary-agent.yml
@@ -25,10 +25,8 @@ jobs:
uv pip install -e "../PR_code_review-agent[all,dev]" --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-first-match
git diff --merge-base HEAD^1 HEAD > PRChanges.patch
cat PRChanges.patch
- uv run pr_agent/test_files/mock_pr_agent.py
-
-#uv run pr_agent/PR_agent.py
-
+ uv run pr_agent/PR_agent.py
+
env:
OPENAI_API_KEY: ${{ secrets.PRAgentOpenAIKey }}
PR_ID: ${{ github.event.pull_request.number }}
diff --git a/pr_agent/PR_agent.py b/pr_agent/PR_agent.py
index 46b8f1f..c99d863 100644
--- a/pr_agent/PR_agent.py
+++ b/pr_agent/PR_agent.py
@@ -29,14 +29,6 @@ class SearchResult(BaseModel):
similarity_score: float = Field(
description="Similarity score returned from vector search."
)
- is_relevant: bool = Field(
- default = True,
- description="Boolean describing if the search result is relevant to the query."
- )
- relevance_reason: str = Field(
- default = "",
- description="Boolean describing if the search result is relevant to the query."
- )
included_defs: List[str] = Field(
default_factory=list,
description="Similarity score returned from vector search."
@@ -49,7 +41,6 @@ class Searches(BaseModel):

class RelevanceResult(BaseModel):
relevant: bool
- reason: str

class PRReviewAgent(Agent):

@@ -81,7 +72,7 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a

self.relevanceAgent = Agent(
name="Code Relevange Agent",
- instructions="""You are an expert in determining if a snippet of code or documentation is needed to determine the purpose of a code change from the patch file. Your response must include a 'relevant' field boolean and a 'reason' field with a brief explanation.""",
+ instructions="""You are an expert in determining if a snippet of code or documentation is directly relevant to a query. Your response must include a 'relevant' field boolean.""",
model=GPT_4O_MINI,
result_model=RelevanceResult,
)
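
The RelevanceResult model above is trimmed to a single boolean, and the relevance agent is asked to return just that field as structured output. A minimal sketch of parsing such a response with pydantic (assuming pydantic v2; the JSON payload is illustrative):

# Minimal structured-output model; the JSON string below is illustrative.
from pydantic import BaseModel

class RelevanceResult(BaseModel):
    relevant: bool

raw = '{"relevant": true}'   # e.g. what the relevance agent is instructed to emit
result = RelevanceResult.model_validate_json(raw)
print(result.relevant)       # True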
@@ -142,10 +133,10 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a
}
)

- print("quer"+str(queries))
+ print("queries: "+str(queries))

- all_results = []
-
+ # RAG queries
+ all_results = {}
for query in queries.searches[:10]:
searchResponse = yield from self.code_rag_agent.final_result(
f"Search codebase",
@@ -156,35 +147,34 @@ You are an expert in generating NON-NATURAL LANGUAGE CODE search queries from a
)

# Process each result
- for result in searchResponse.sections:
- all_results.append(SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs))
+ for key, result in searchResponse.sections.items():
+ if not key in all_results:
+ all_results[key] = SearchResult(query=query,file_path=result.file_path,content=result.search_result,similarity_score=result.similarity_score,included_defs=result.included_defs)

- print("fil"+str(all_results))
+ print("all: "+str(all_results))

# Filter search results using LLM-based relevance checking
filtered_results = []
-
- for result in all_results:
- if result.similarity_score < 0.5:
- continue
-
- relevance_check = yield from self.relevanceAgent.final_result(
- f"<Patch File>\n{request_context.get("patch_content")}\n</Patch File>\n\n<Content>{result.content}</Content><Query>{result.query}</Query>"
- )
-
- print(relevance_check)
-
- result.is_relevant = relevance_check.relevant
- result.relevance_reason = relevance_check.reason
+ for result in all_results.values():

- if result.is_relevant:
- filtered_results.append(result)
+ try:
+ relevance_check = yield from self.relevanceAgent.final_result(
+ f"<Patch File>\n{request_context.get("patch_content")}\n</Patch File>\n\n<Content>{result.content}</Content><Query>{result.query}</Query>"
+ )
+
+ if relevance_check.relevant:
+ filtered_results.append(result)
+ except Exception as e:
+ # LLM error
+ print(e)

- print(str(filtered_results))
+ print("filtered: ",str(filtered_results))

# Prepare for summary
formatted_str = self.prepare_summary(request_context.get("patch_content"),filtered_results)

+ print(formatted_str)
+
summary = yield from self.summaryAgent.final_result(
formatted_str
)
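
The PR_agent.py changes above collect search hits into a dict keyed by file so duplicate files are collapsed, drop the per-result relevance bookkeeping, and wrap each relevance call in try/except so one failed LLM call no longer aborts the filtering pass. A minimal sketch of that pattern, with check_relevance() as a hypothetical stand-in for relevanceAgent.final_result():

# Dedupe hits by file path, then keep only the ones an LLM judges relevant.
from dataclasses import dataclass

@dataclass
class Hit:
    query: str
    file_path: str
    content: str

def check_relevance(patch: str, hit: Hit) -> bool:
    # Hypothetical LLM call; assume it can raise on transient errors.
    raise NotImplementedError

def filter_hits(patch: str, hits: list[Hit]) -> list[Hit]:
    unique: dict[str, Hit] = {}
    for hit in hits:
        if hit.file_path not in unique:      # first hit per file wins
            unique[hit.file_path] = hit

    kept: list[Hit] = []
    for hit in unique.values():
        try:
            if check_relevance(patch, hit):
                kept.append(hit)
        except Exception as exc:
            # A single failed check should not abort the whole pass.
            print(f"relevance check failed for {hit.file_path}: {exc}")
    return kept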
diff --git a/pr_agent/code_rag_agent.py b/pr_agent/code_rag_agent.py
index b7f36df..32d0a34 100644
--- a/pr_agent/code_rag_agent.py
+++ b/pr_agent/code_rag_agent.py
@@ -21,7 +21,7 @@ class CodeSection(BaseModel):
)

class CodeSections(BaseModel):
- sections: List[CodeSection] = Field(
+ sections: dict[str,CodeSection] = Field(
description="Sections of the codebase returned from the search.",
)
search_query: str = Field(
@@ -46,7 +46,8 @@ class CodeRagAgent(Agent):

self.ragTool = RAGTool(
default_index="codebase",
- index_paths=["../*.md","../*.py"],
+ index_paths=["../**/*.py","../**/*.md"],
+ recursive=True
)


@@ -65,24 +66,28 @@ class CodeRagAgent(Agent):
searchQuery = request_context.get("query")

searchResult = self.ragTool.search_knowledge_index(query=searchQuery,limit=5)
-
- allSections = CodeSections(sections=[],search_query=query)
+
+ allSections = CodeSections(sections={},search_query=query)

for nextResult in searchResult:
- print(nextResult)
file_path = nextResult["source_url"]
- similarity_score = nextResult["distance"] if nextResult["distance"] else 0
- content = nextResult["content"]
+ if not file_path in allSections.sections:
+ #print(nextResult)
+
+ similarity_score = nextResult["distance"] if nextResult["distance"] else 0
+ content = nextResult["content"]

- # Only works with Python files
- included_defs = []
- try:
- with open(file_path) as file:
- node = ast.parse(file.read())
- included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)]
- except:
+ # Only works with Python files
included_defs = []
+ try:
+ with open(file_path) as file:
+ node = ast.parse(file.read())
+ included_defs = [n.name for n in node.body if isinstance(n, ast.ClassDef) or isinstance(n, ast.FunctionDef)]
+ except:
+ included_defs = []

- allSections.sections.append(CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score))
+ allSections.sections[file_path] = CodeSection(search_result=content,file_path=file_path,included_defs=included_defs,similarity_score=similarity_score)
+ #else:
+ #print("Skipping Duplicate: ",file_path)

yield TurnEnd(self.name, [{"content": allSections}])
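
code_rag_agent.py now keys sections by file_path so repeated chunks from the same file are skipped, and for Python sources it lists the top-level class and function names with the standard ast module. A self-contained sketch of that extraction (the .py guard and the example call are assumptions):

# Collect top-level class/function names from a Python source file.
import ast

def top_level_defs(file_path: str) -> list[str]:
    if not file_path.endswith(".py"):        # only meaningful for Python files
        return []
    try:
        with open(file_path) as fh:
            tree = ast.parse(fh.read())
    except (OSError, SyntaxError):
        return []                            # unreadable or unparsable files yield nothing
    return [
        node.name
        for node in tree.body
        if isinstance(node, (ast.ClassDef, ast.FunctionDef))
    ]

# e.g. top_level_defs("pr_agent/code_rag_agent.py") -> ["CodeSection", "CodeSections", "CodeRagAgent"]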
diff --git a/src/agentic/events.py b/src/agentic/events.py
index 01aa68e..da12979 100644
--- a/src/agentic/events.py
+++ b/src/agentic/events.py
@@ -656,7 +656,6 @@ class TurnEnd(Event):
def result(self):
"""Safe result access with fallback"""
try:
- print(self.agent,self.messages)
return self.messages[-1]["content"] if self.messages else "No response generated"
except (IndexError, KeyError):
return "Error: Malformed response"
diff --git a/src/agentic/tools/rag_tool.py b/src/agentic/tools/rag_tool.py
index e3e7280..9891e7c 100644
--- a/src/agentic/tools/rag_tool.py
+++ b/src/agentic/tools/rag_tool.py
@@ -14,6 +14,7 @@ from agentic.utils.rag_helper import (
init_embedding_model,
init_chunker,
rag_index_file,
+ rag_index_multiple_files,
)

from agentic.utils.summarizer import generate_document_summary
@@ -44,6 +45,8 @@ class RAGTool(BaseAgenticTool):
# Construct the RAG tool. You can pass a list of files and we will ensure that
# they are added to the index on startup. Paths can include glob patterns also,
# like './docs/*.md'.
+        # Enable recursive ('**/*.md') glob patterns with recursive=True
+
self.default_index = default_index
self.index_paths = index_paths
if self.index_paths:
@@ -51,8 +54,11 @@ class RAGTool(BaseAgenticTool):
if default_index not in list_collections(client):
create_collection(client, default_index, VectorDistances.COSINE)
for path in index_paths:
- for file_path in [path] if path.startswith("http") else glob.glob(path, recursive=recursive):
- rag_index_file(file_path, self.default_index, client=client, ignore_errors=True)
+ if path.startswith("http"):
+ rag_index_file(path, self.default_index, client=client, ignore_errors=True)
+ else:
+ file_paths = glob.glob(path, recursive=recursive)
+ rag_index_multiple_files(file_paths, self.default_index, client=client, ignore_errors=True)

def get_tools(self) -> List[Callable]:
return [
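
rag_tool.py now separates http(s) URLs, which still go through rag_index_file one at a time, from local glob patterns, which are expanded with glob.glob(..., recursive=True) and handed to the new rag_index_multiple_files in one batch. A rough sketch of that dispatch, with index_one()/index_many() as hypothetical stand-ins for the two helpers:

# Split remote URLs from local glob patterns before indexing.
import glob

def index_one(path: str) -> None: ...          # hypothetical: rag_index_file
def index_many(paths: list[str]) -> None: ...  # hypothetical: rag_index_multiple_files

def index_paths(paths: list[str], recursive: bool = True) -> None:
    for path in paths:
        if path.startswith("http"):
            index_one(path)                    # remote documents are fetched individually
        else:
            matches = glob.glob(path, recursive=recursive)  # '**' only recurses with recursive=True
            index_many(matches)                # local files are indexed as a single batch

# e.g. index_paths(["../**/*.py", "../**/*.md"])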
diff --git a/src/agentic/utils/file_reader.py b/src/agentic/utils/file_reader.py
index 3ea5ae2..70671fb 100644
--- a/src/agentic/utils/file_reader.py
+++ b/src/agentic/utils/file_reader.py
@@ -57,6 +57,9 @@ def read_file(file_path: str, mime_type: str|None = None) -> tuple[str, str]:
return text, mime_type
elif mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
return pd.read_excel(file_path).to_csv(), mime_type
+ elif mime_type == "text/x-python":
+ with open(file_path,"r") as f:
+ return f.read(), mime_type
else:
return textract.process(file_path).decode('utf-8'), mime_type
except Exception as e:
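
file_reader.py gains a branch for the text/x-python mime type so Python sources are read as plain text rather than handed to textract. A minimal sketch of that kind of mime-based dispatch, using the standard mimetypes module to guess the type (the fallback is left unimplemented here):

# Read Python sources as plain text based on their guessed mime type.
import mimetypes

def read_source(file_path: str) -> tuple[str, str]:
    mime_type, _ = mimetypes.guess_type(file_path)   # ".py" guesses as "text/x-python"
    if mime_type == "text/x-python":
        with open(file_path, "r") as fh:
            return fh.read(), mime_type
    # Assumption: anything else would fall back to a generic extractor.
    raise NotImplementedError(f"no plain-text reader for {mime_type}")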
diff --git a/src/agentic/utils/rag_helper.py b/src/agentic/utils/rag_helper.py
index 94dd7f5..1d585f2 100644
--- a/src/agentic/utils/rag_helper.py
+++ b/src/agentic/utils/rag_helper.py
@@ -234,7 +234,96 @@ def rag_index_file(
if client and client_created:
client.close()
return "indexed"
+
+def rag_index_multiple_files(
+ file_paths: List[str],
+ index_name: str,
+ chunk_threshold: float = 0.5,
+ chunk_delimiters: str = ". ,! ,? ,\n",
+ embedding_model: str = "BAAI/bge-small-en-v1.5",
+ client: WeaviateClient|None = None,
+ ignore_errors: bool = False,
+ distance_metric: VectorDistances = VectorDistances.COSINE,
+):
+ """Index a file using configurable Weaviate Embedded and chunking parameters"""
+
+ console = Console()
+ client_created = False
+ try:
+ with Status("[bold green]Initializing Weaviate..."):
+ if client is None:
+ client = init_weaviate()
+ client_created = True
+ create_collection(client, index_name, distance_metric)
+
+ with Status("[bold green]Initializing models..."):
+ embed_model = init_embedding_model(embedding_model)
+ chunker = init_chunker(chunk_threshold, chunk_delimiters)

+ for file_path in file_paths:
+ with Status(f"[bold green]Processing {file_path}...", console=console):
+ text, mime_type = read_file(str(file_path))
+ metadata = prepare_document_metadata(file_path, text, mime_type, GPT_DEFAULT_MODEL)
+
+ console.print(f"[bold green]Indexing {file_path}...")
+
+ collection = client.collections.get(index_name)
+ exists, status = check_document_exists(
+ collection,
+ metadata["document_id"],
+ metadata["fingerprint"]
+ )
+
+ if status == "unchanged":
+ console.print(f"[yellow]⏩ Document '{metadata['filename']}' unchanged[/yellow]")
+ continue
+ elif status == "duplicate":
+ console.print(f"[yellow]⚠️ Content already exists under different filename[/yellow]")
+ continue
+ elif status == "changed":
+ console.print(f"[yellow]🔄 Updating changed document '{metadata['filename']}'[/yellow]")
+ collection.data.delete_many(
+ where=Filter.by_property("document_id").equal(metadata["document_id"])
+ )
+
+ with Status("[bold green]Generating document summary...", console=console):
+ metadata["summary"] = generate_document_summary(
+ text=text[:12000],
+ mime_type=mime_type,
+ model=GPT_DEFAULT_MODEL
+ )
+
+ chunks = chunker(text)
+ chunks_text = [chunk.text for chunk in chunks]
+ if not chunks_text:
+ if ignore_errors:
+ return client
+ raise ValueError("No text chunks generated from document")
+
+ batch_size = 128
+ embeddings = []
+ with Status("[bold green]Generating embeddings..."):
+ for i in range(0, len(chunks_text), batch_size):
+ batch = chunks_text[i:i+batch_size]
+ embeddings.extend(list(embed_model.embed(batch)))
+
+ with Status("[bold green]Indexing chunks..."), collection.batch.dynamic() as batch:
+ for i, chunk in enumerate(chunks):
+ vector = embeddings[i].tolist()
+ batch.add_object(
+ properties={
+ **metadata,
+ "content": chunk.text,
+ "chunk_index": i,
+ },
+ vector=vector
+ )
+
+ console.print(f"[bold green]✅ Indexed {len(chunks)} chunks in {index_name}")
+ finally:
+ if client and client_created:
+ client.close()
+ return "indexed"

def delete_document_from_index(
collection: Any,
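
rag_index_multiple_files embeds chunk text in slices of 128 before writing them to the collection, which keeps each embedding call bounded regardless of how many chunks a document produces. A stripped-down sketch of that batching loop, with embed_batch() as a hypothetical stand-in for embed_model.embed():

# Embed chunk texts in fixed-size batches to bound each model call.
def embed_batch(texts: list[str]) -> list[list[float]]: ...   # hypothetical embedding call

def embed_in_batches(chunks_text: list[str], batch_size: int = 128) -> list[list[float]]:
    embeddings: list[list[float]] = []
    for i in range(0, len(chunks_text), batch_size):
        batch = chunks_text[i:i + batch_size]
        embeddings.extend(embed_batch(batch))    # one embedding call per batch
    return embeddings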