diff --git a/README.md b/README.md index 20beca5..cdd47a0 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,6 @@ async with CZeroEngineClient() as client: response = await client.chat( message="What are the key features?", use_rag=True, - chunk_limit=5, similarity_threshold=0.3 # Lower threshold for better recall ) @@ -99,7 +98,7 @@ async with CZeroEngineClient() as client: use_rag=False ) response_with_rag = await client.chat( - message="Explain semantic search", + message="Explain semantic search", use_rag=True, similarity_threshold=0.3 ) @@ -122,7 +121,7 @@ results = await client.semantic_search( # Use direct client for persona interactions async with CZeroEngineClient() as client: # Chat with default Gestalt persona - response = await client.chat_with_persona( + response = await client.persona_chat( persona_id="gestalt-default", # default persona message="Analyze the implications of AGI" ) @@ -130,7 +129,8 @@ async with CZeroEngineClient() as client: # Or use regular chat (defaults to Gestalt if no persona specified) response = await client.chat( message="What are the key features of CZero Engine?", - use_rag=True + use_rag=True, + workspace_filter="workspace-id" # Optional: Filter to specific workspace ) ``` @@ -187,7 +187,7 @@ from langchain_anthropic import ChatAnthropic # Use multiple LLMs in your workflow cloud_llm = ChatOpenAI(model="gpt-4") # Or Anthropic, Google, etc. -local_llm = CZeroEngineLLM() # Your local CZero Engine +local_llm = CZeroEngineLLM() # Your local CZero Engine # The possibilities are endless! 
šŸš€ ``` @@ -269,7 +269,6 @@ uv run czero version | `/api/health` | GET | System health check | | `/api/chat/send` | POST | LLM chat with optional RAG | | `/api/vector/search/semantic` | POST | Semantic search with hierarchy | -| `/api/vector/search/similarity` | POST | Find similar chunks | | `/api/embeddings/generate` | POST | Generate text embeddings | | `/api/workspaces/create` | POST | Create workspace | | `/api/workspaces/process` | POST | Process documents | diff --git a/examples/03_persona_interactions.py b/examples/03_persona_interactions.py index 455d466..a60835a 100644 --- a/examples/03_persona_interactions.py +++ b/examples/03_persona_interactions.py @@ -161,13 +161,32 @@ async def persona_with_rag(): print("-" * 30) async with CZeroEngineClient() as client: - # Use persona chat with RAG context + # First, list workspaces to find one with documents + workspaces = await client.list_workspaces() + workspace_id = None + + if workspaces.workspaces: + # Use the first available workspace + workspace_id = workspaces.workspaces[0].id + print(f"šŸ“ Using workspace: {workspaces.workspaces[0].name}") + else: + print("āš ļø No workspaces found. 
Creating a sample workspace...") + # Create a sample workspace if none exist + import tempfile + with tempfile.TemporaryDirectory() as temp_dir: + workspace = await client.create_workspace( + name="Sample Workspace", + path=temp_dir + ) + workspace_id = workspace.id + + # Use persona chat with RAG context from workspace print("\nšŸ” Asking Gestalt with document context...\n") - # This would use any processed documents in your workspace response = await client.persona_chat( persona_id="gestalt-default", # Use real persona message="Based on the documents, what are the key features of CZero Engine?", + workspace_filter=workspace_id, # Enable RAG with this workspace max_tokens=100 # Moderate response ) diff --git a/examples/05_langgraph_integration.py b/examples/05_langgraph_integration.py index dc29892..3151f68 100644 --- a/examples/05_langgraph_integration.py +++ b/examples/05_langgraph_integration.py @@ -64,6 +64,7 @@ class CZeroEngineLLM(BaseChatModel): temperature: float = 0.7 base_url: str = "http://localhost:1421" persona_id: str = "gestalt-default" + workspace_id: Optional[str] = None # For RAG context class Config: arbitrary_types_allowed = True @@ -123,7 +124,8 @@ async def _agenerate( message=prompt, system_prompt_template=system_prompt, max_tokens=self.max_tokens, - temperature=self.temperature + temperature=self.temperature, + workspace_filter=self.workspace_id # Add RAG context if available ) else: response = await self.client.chat( @@ -131,7 +133,8 @@ async def _agenerate( use_rag=self.use_rag, system_prompt=system_prompt, max_tokens=self.max_tokens, - temperature=self.temperature + temperature=self.temperature, + workspace_filter=self.workspace_id # Add RAG context if available ) message = AIMessage(content=response.response) diff --git a/src/czero_engine/client.py b/src/czero_engine/client.py index d77d2ee..b745731 100644 --- a/src/czero_engine/client.py +++ b/src/czero_engine/client.py @@ -11,7 +11,6 @@ from .models import ( ChatRequest, 
ChatResponse, SemanticSearchRequest, SemanticSearchResponse, - SimilaritySearchRequest, - RecommendationsRequest, DocumentsResponse, DocumentMetadata, DocumentFullTextResponse, + DocumentsResponse, DocumentMetadata, DocumentFullTextResponse, EmbeddingRequest, EmbeddingResponse, WorkspaceCreateRequest, WorkspaceResponse, WorkspaceListResponse, WorkspaceInfo, @@ -197,72 +196,8 @@ async def semantic_search( response.raise_for_status() return SemanticSearchResponse(**response.json()) - async def find_similar_chunks( - self, - chunk_id: str, - limit: int = 5, - similarity_threshold: float = 0.5 - ) -> SemanticSearchResponse: - """ - Find chunks similar to a given chunk ID. - - Useful for finding related content or duplicates. - - Args: - chunk_id: ID of the reference chunk - limit: Maximum number of results - similarity_threshold: Minimum similarity score - - Returns: - SemanticSearchResponse with similar chunks - """ - request = SimilaritySearchRequest( - chunk_id=chunk_id, - limit=limit, - similarity_threshold=similarity_threshold - ) - - self._log(f"Finding similar to chunk: {chunk_id}") - response = await self.client.post( - f"{self.base_url}/api/vector/search/similarity", - json=request.model_dump() - ) - response.raise_for_status() - return SemanticSearchResponse(**response.json()) - - async def get_recommendations( - self, - positive_chunk_ids: List[str], - negative_chunk_ids: Optional[List[str]] = None, - limit: int = 10 - ) -> SemanticSearchResponse: - """ - Get content recommendations based on positive/negative examples. - - Uses vector math to find content similar to positive examples - and dissimilar to negative examples. 
- - Args: - positive_chunk_ids: Chunk IDs to find similar content to - negative_chunk_ids: Chunk IDs to avoid similarity to - limit: Maximum number of recommendations - - Returns: - SemanticSearchResponse with recommended chunks - """ - request = RecommendationsRequest( - positive_chunk_ids=positive_chunk_ids, - negative_chunk_ids=negative_chunk_ids or [], - limit=limit - ) - - self._log(f"Getting recommendations based on {len(positive_chunk_ids)} positive examples") - response = await self.client.post( - f"{self.base_url}/api/vector/recommendations", - json=request.model_dump() - ) - response.raise_for_status() - return SemanticSearchResponse(**response.json()) + # Note: find_similar_chunks and get_recommendations methods have been deprecated + # Use semantic_search or hierarchical_retrieve for similar functionality # ==================== Document Management ==================== @@ -511,12 +446,14 @@ async def persona_chat( system_prompt_template: Optional[str] = None, conversation_history: Optional[List[Dict[str, str]]] = None, max_tokens: int = 1024, - temperature: float = 0.7 + temperature: float = 0.7, + workspace_filter: Optional[str] = None ) -> PersonaChatResponse: """ Chat with a specific AI persona. Each persona has its own personality, expertise, and interaction style. + Now supports RAG context when workspace_filter is provided. 
Args: persona_id: ID of the persona to chat with @@ -526,6 +463,7 @@ async def persona_chat( conversation_history: Optional conversation history for context max_tokens: Maximum tokens to generate temperature: Temperature for generation + workspace_filter: Optional workspace ID for RAG context Returns: PersonaChatResponse with persona's response @@ -537,7 +475,8 @@ async def persona_chat( system_prompt_template=system_prompt_template, conversation_history=conversation_history, max_tokens=max_tokens, - temperature=temperature + temperature=temperature, + workspace_filter=workspace_filter ) self._log(f"Chatting with persona: {persona_id}") diff --git a/src/czero_engine/models.py b/src/czero_engine/models.py index 5139cd4..0c634ef 100644 --- a/src/czero_engine/models.py +++ b/src/czero_engine/models.py @@ -70,18 +70,8 @@ class SemanticSearchResponse(BaseModel): results: List[SearchResult] -class SimilaritySearchRequest(BaseModel): - """Request model for /api/vector/search/similarity endpoint.""" - chunk_id: str - limit: int = 5 - similarity_threshold: float = 0.5 - - -class RecommendationsRequest(BaseModel): - """Request model for /api/vector/recommendations endpoint.""" - positive_chunk_ids: List[str] - negative_chunk_ids: Optional[List[str]] = Field(default_factory=list) - limit: int = 10 +# Note: SimilaritySearchRequest and RecommendationsRequest have been deprecated +# Use SemanticSearchRequest or HierarchicalRetrievalRequest instead # Document Models @@ -205,6 +195,7 @@ class PersonaChatRequest(BaseModel): conversation_history: Optional[List[ConversationMessage]] = None max_tokens: Optional[int] = 1024 temperature: Optional[float] = 0.7 + workspace_filter: Optional[str] = None # For RAG context class PersonaChatResponse(BaseModel): diff --git a/tests/test_all_endpoints.py b/tests/test_all_endpoints.py index 4c42392..e4fef8b 100644 --- a/tests/test_all_endpoints.py +++ b/tests/test_all_endpoints.py @@ -215,26 +215,8 @@ async def run_all_tests(self): ) # Extract 
chunk_id for similarity search - if search_data and search_data.get("results"): - self.chunk_id = search_data["results"][0]["chunk_id"] - - # Similarity search - await self.test_endpoint( - "Similarity Search", "POST", "/api/vector/search/similarity", - { - "chunk_id": self.chunk_id, - "limit": 3 - } - ) - - # Recommendations - await self.test_endpoint( - "Get Recommendations", "POST", "/api/vector/recommendations", - { - "positive_chunk_ids": [self.chunk_id], - "limit": 5 - } - ) + # Note: Similarity search and recommendations endpoints are deprecated + # The new hierarchical retrieval system replaces these with direct query-based search # 7. Hierarchical Retrieval console.print("\n[bold yellow]═══ Hierarchical Retrieval ═══[/bold yellow]") @@ -274,9 +256,9 @@ async def run_all_tests(self): if persona_data and persona_data.get("persona_id"): self.persona_id = persona_data["persona_id"] - # Chat with persona + # Chat with persona (without RAG) await self.test_endpoint( - "Persona Chat", "POST", "/api/personas/chat", + "Persona Chat (No RAG)", "POST", "/api/personas/chat", { "persona_id": self.persona_id, "message": "Hello, test persona!", @@ -284,6 +266,17 @@ async def run_all_tests(self): } ) + # Chat with persona (with RAG) + await self.test_endpoint( + "Persona Chat (With RAG)", "POST", "/api/personas/chat", + { + "persona_id": self.persona_id, + "message": "Based on the test document, what can you tell me?", + "workspace_filter": self.workspace_id, + "max_tokens": 100 + } + ) + # Delete persona await self.test_endpoint( "Delete Persona", "DELETE", f"/api/personas/{self.persona_id}"