Onededios · Copilot · Nov 28, 2025 · Nov 28, 2025
diff --git a/python/README.md b/python/README.md
@@ -8,7 +8,7 @@ This module contains the Python components for the IOC.EAssistant project, inclu
 - Python 3.7 or higher
 - pip package manager
 
-### Install requiments
+### Install requirements
 
 ```bash
 cd python
@@ -122,11 +122,11 @@ The system supports two model providers:
 
 ```bash
 # Start the web server
-python web.py
+python app.py
 ```
 
-The server will start on `http://localhost:8000` and provide:
-- **Swagger UI**: `http://localhost:8000/apidocs/` for interactive API documentation
+The server will start on `http://localhost:8080` and provide:
+- **Swagger UI**: `http://localhost:8080/apidocs/` for interactive API documentation
 - **RESTful API**: Endpoints for RAG-powered chat with conversation history
 
 #### API Endpoints
@@ -139,7 +139,7 @@ Check if the service is running and get model information.
 
 **Example:**
 ```bash
-curl http://localhost:8000/health
+curl http://localhost:8080/health
 ```
 
 **Response:**
@@ -253,7 +253,7 @@ python/
 ├── rag_agent.py               # RAG Agent with LangChain (stateless)
 ├── utils.py                   # Utility functions (GPU config, formatting)
 ├── vectorize_documents.py     # Document vectorization script
-├── web.py                     # Flask API server (stateless)
+├── app.py                     # Flask API server (stateless)
 ├── requirements.txt           # Python dependencies
 ├── README.md                  # This file
 ├── data/                      # Crawled data storage (JSON files)

diff --git a/python/app.py b/python/app.py
@@ -4,8 +4,8 @@
 import os
 import sys
 import subprocess
-import asyncio
 from datetime import datetime
+import tiktoken
 
 # --- Flask + Swagger setup ---
 app = Flask(__name__)
@@ -32,24 +32,28 @@ def check_and_setup_data():
     # If data doesn't exist, run crawler
     if not data_exists:
       print("crawling data...")
-      subprocess.run(
+      result = subprocess.run(
           [sys.executable, "crawler.py"],
           cwd=os.path.dirname(os.path.abspath(__file__)),
           capture_output=True,
           text=True,
           timeout=600  # 10 minutes timeout
       )
+      if result.returncode != 0:
+          print(f"Error running crawler.py: {result.stderr}", file=sys.stderr)
 
     # If ChromaDB doesn't exist, run vectorize_documents
     if not chroma_db_exists:
       print("vectorizing documents...")
-      subprocess.run(
+      result = subprocess.run(
           [sys.executable, "vectorize_documents.py"],
           cwd=os.path.dirname(os.path.abspath(__file__)),
           capture_output=True,
           text=True,
           timeout=600  # 10 minutes timeout
       )
+      if result.returncode != 0:
+          print(f"Error running vectorize_documents.py: {result.stderr}", file=sys.stderr)
 
 # --- Check and setup data before initializing RAG Agent ---
 print("Checking prerequisites...")
@@ -172,14 +176,30 @@ def chat():
               type: string
     """
     try:
-        data = request.get_json(force=True)
+        if not request.is_json:
+            return jsonify({"error": "Content-Type must be application/json"}), 400
+        data = request.get_json()
         messages = data.get("messages", [])
         model_config = data.get("modelConfig", {})
-        metadata = data.get("metadata", {})
 
         if not messages:
             return jsonify({"error": "messages field required"}), 400
 
+        # Input validation: limit conversation history length to prevent abuse
+        MAX_MESSAGES = 50
+        MAX_CONTENT_LENGTH = 100000  # 100KB total content
+
+        if len(messages) > MAX_MESSAGES:
+            return jsonify({"error": f"Maximum {MAX_MESSAGES} messages allowed in conversation history"}), 400
+
+        # Calculate total content length
+        total_content = sum(
+            len(msg.get("question", "")) + len(msg.get("answer", ""))
+            for msg in messages
+        )
+        if total_content > MAX_CONTENT_LENGTH:
+            return jsonify({"error": f"Total content length exceeds {MAX_CONTENT_LENGTH} characters"}), 400
+
         # Get the last message (current question)
         last_message = messages[-1]
         current_question = last_message.get("question", "").strip()
@@ -209,9 +229,16 @@ def chat():
         end_time = datetime.now()
         processing_time = int((end_time - start_time).total_seconds() * 1000)
 
-        # Estimate token usage (rough approximation)
-        prompt_tokens = sum(len(q.split()) + len(a.split()) for q, a in conversation_history) + len(current_question.split())
-        completion_tokens = len(answer.split())
+        # Use tiktoken for accurate token counting
+        try:
+            encoding = tiktoken.encoding_for_model("gpt-4")
+        except KeyError:
+            encoding = tiktoken.get_encoding("cl100k_base")
+
+        # Calculate prompt tokens from conversation history and current question
+        prompt_text = "".join(q + a for q, a in conversation_history) + current_question
+        prompt_tokens = len(encoding.encode(prompt_text))
+        completion_tokens = len(encoding.encode(answer))
         total_tokens = prompt_tokens + completion_tokens
 
         # Return response in the expected format

diff --git a/python/crawler.py b/python/crawler.py
@@ -174,7 +174,7 @@ async def extract_page_content(self, page):
         page_data = {
             "title": title_text,
             "content": content_text,
-            "type": "noticia" if "latest-news" in page else "general",
+            "type": "noticia" if "latest-news" in self.page.url else "general",
         }
 
         os.makedirs("data", exist_ok=True)

diff --git a/python/rag_agent.py b/python/rag_agent.py
@@ -147,9 +147,9 @@ def retrieve_general_context(query: str):
                         k=k,
                         filter={"type": "general"},
                     )
-                    # For some backends higher score is better; if None, keep the doc
+                    # For ChromaDB, lower distance scores are better (0 = identical)
                     retrieved_docs = [
-                        doc for doc, score in with_scores if (score is None or score >= score_threshold)
+                        doc for doc, score in with_scores if (score is None or score <= score_threshold)
                     ]
             except Exception:
                 # Fallback to basic similarity search
@@ -189,8 +189,9 @@ def retrieve_noticia_context(query: str):
                         k=k,
                         filter={"type": "noticia"},
                     )
+                    # For ChromaDB, lower distance scores are better (0 = identical)
                     retrieved_docs = [
-                        doc for doc, score in with_scores if (score is None or score >= score_threshold)
+                        doc for doc, score in with_scores if (score is None or score <= score_threshold)
                     ]
             except Exception:
                 retrieved_docs = vector_store.similarity_search(
@@ -266,6 +267,77 @@ def _initialize_agent(self) -> None:
             print("Agent mode not supported, langchain version may be outdated.")
             self.use_agent = False
 
+    # ---------------------------- Helper methods ----------------------------
+    def _fallback_retrieval(self, question: str, verbose: bool = True) -> list:
+        """
+        Fallback retrieval method using diversified retrieval across both document types.
+
+        Args:
+            question: The question to search for
+            verbose: Whether to print debug information
+
+        Returns:
+            List of retrieved documents
+        """
+        ctx_docs = []
+        try:
+            ctx_docs += self.vector_store.max_marginal_relevance_search(
+                question,
+                k=self.k_results,
+                fetch_k=max(20, self.k_results * self.fetch_k_multiplier),
+                filter={"type": "general"},
+            )
+        except Exception:
+            ctx_docs += self.vector_store.similarity_search(
+                question, k=self.k_results
+            )
+        try:
+            ctx_docs += self.vector_store.max_marginal_relevance_search(
+                question,
+                k=self.k_results,
+                fetch_k=max(20, self.k_results * self.fetch_k_multiplier),
+                filter={"type": "noticia"},
+            )
+        except Exception as e:
+            if verbose:
+                print(f"Noticia MMR search failed, skipping: {e}")
+
+        return ctx_docs
+
+    def _create_llm_with_temperature(self, temperature: float):
+        """
+        Create a new LLM instance with the specified temperature.
+        This is thread-safe compared to modifying the LLM's temperature attribute.
+
+        Args:
+            temperature: The temperature to use for the new LLM instance
+
+        Returns:
+            A new LLM instance with the specified temperature
+        """
+        if self.provider == "openai":
+            api_key = os.getenv("OPENAI_API_KEY")
+            return ChatOpenAI(
+                model=self.llm.model_name,
+                temperature=temperature,
+                openai_api_key=api_key,
+            )
+        elif self.provider == "ollama":
+            try:
+                import torch
+                num_gpu_param = -1 if torch.cuda.is_available() else 0
+            except Exception:
+                num_gpu_param = 0
+            return ChatOllama(
+                model=self.llm.model,
+                temperature=temperature,
+                num_gpu=num_gpu_param,
+                num_ctx=getattr(self.llm, 'num_ctx', 8192),
+            )
+        else:
+            # Fallback to the existing LLM if provider is unknown
+            return self.llm
+
     # ---------------------------- Query path -------------------------------
     def query(self, question: str, verbose: bool = True) -> str:
         """
@@ -281,28 +353,8 @@ def query(self, question: str, verbose: bool = True) -> str:
             if verbose:
                 print(f"Agent invocation failed, using simple RAG fallback: {e}")
 
-            # Simple RAG fallback: attempt diversified retrieval across both types
-            ctx_docs = []
-            try:
-                ctx_docs += self.vector_store.max_marginal_relevance_search(
-                    question,
-                    k=self.k_results,
-                    fetch_k=max(20, self.k_results * self.fetch_k_multiplier),
-                    filter={"type": "general"},
-                )
-            except Exception:
-                ctx_docs += self.vector_store.similarity_search(
-                    question, k=self.k_results
-                )
-            try:
-                ctx_docs += self.vector_store.max_marginal_relevance_search(
-                    question,
-                    k=self.k_results,
-                    fetch_k=max(20, self.k_results * self.fetch_k_multiplier),
-                    filter={"type": "noticia"},
-                )
-            except Exception as e:
-                print(f"Noticia MMR search failed, skipping: {e}")
+            # Use the helper method for fallback retrieval
+            ctx_docs = self._fallback_retrieval(question, verbose)
 
             context_blob = "\n\n".join(
                 [f"Source: {d.metadata}\nContent: {d.page_content}" for d in ctx_docs]
@@ -343,11 +395,8 @@ def query_with_history(
         # Current question
         messages.append(HumanMessage(content=question))
 
-        # Temporarily override temperature if provided
-        original_temp = None
-        if temperature is not None:
-            original_temp = self.llm.temperature
-            self.llm.temperature = temperature
+        # Create a new LLM with custom temperature if provided (thread-safe)
+        llm_to_use = self._create_llm_with_temperature(temperature) if temperature is not None else self.llm
 
         try:
             response = self.agent.invoke({"messages": messages})
@@ -356,42 +405,16 @@ def query_with_history(
             if verbose:
                 print(f"Agent invocation failed, using simple RAG fallback: {e}")
 
-            # Simple RAG fallback: attempt diversified retrieval across both types
-            ctx_docs = []
-            try:
-                ctx_docs += self.vector_store.max_marginal_relevance_search(
-                    question,
-                    k=self.k_results,
-                    fetch_k=max(20, self.k_results * self.fetch_k_multiplier),
-                    filter={"type": "general"},
-                )
-            except Exception:
-                ctx_docs += self.vector_store.similarity_search(
-                    question, k=self.k_results
-                )
-            try:
-                ctx_docs += self.vector_store.max_marginal_relevance_search(
-                    question,
-                    k=self.k_results,
-                    fetch_k=max(20, self.k_results * self.fetch_k_multiplier),
-                    filter={"type": "noticia"},
-                )
-            except Exception:
-                # Failure to retrieve "noticia" type documents is non-fatal;
-                # fallback will proceed with whatever documents were retrieved.
-                pass
+            # Use the helper method for fallback retrieval
+            ctx_docs = self._fallback_retrieval(question, verbose)
 
             context_blob = "\n\n".join(
                 [f"Source: {d.metadata}\nContent: {d.page_content}" for d in ctx_docs]
             )
 
             # Build simple prompt with context
             prompt = f"Context:\n{context_blob}\n\nQuestion: {question}\n\nAnswer:"
-            response_text = self.llm.invoke(prompt).content
-        finally:
-            # Restore original temperature if it was overridden
-            if original_temp is not None:
-                self.llm.temperature = original_temp
+            response_text = llm_to_use.invoke(prompt).content
 
         return response_text
 

diff --git a/python/requirements.txt b/python/requirements.txt
@@ -16,5 +16,6 @@ jq===1.10.0
 langchain-ollama==1.0.0
 langchain-openai==1.0.2
 openai==2.7.2
+tiktoken==0.7.0
 duckduckgo-search==8.1.1
 waitress==3.0.2
diff --git a/python/vectorize_documents.py b/python/vectorize_documents.py
@@ -197,8 +197,8 @@ def vectorize_and_persist(
 
     # Use optimized text splitter with separators that respect document structure
     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-        chunk_size=700,
-        chunk_overlap=120,
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
         separators=["\n\n", "\n", ". ", " ", ""],
     )