NeonGeckoCom · NeonDaniel · Jun 5, 2026 · May 4, 2026 · May 5, 2026 · May 5, 2026
diff --git a/multi_llm_chatbot_backend/app/api/routes/chat.py b/multi_llm_chatbot_backend/app/api/routes/chat.py
@@ -13,7 +13,7 @@
 from app.api.utils import get_or_create_session_for_request_async
 from app.core.auth import get_current_active_user
 from app.config import get_settings
-from app.core.bootstrap import chat_orchestrator
+from app.core.bootstrap import chat_orchestrator, get_llm_client
 from app.core.database import get_database
 from app.core.persona_filter import get_available_persona_ids
 from app.core.session_manager import get_session_manager
@@ -24,6 +24,41 @@
 router = APIRouter()
 session_manager = get_session_manager()
 
+
+def resolve_llm_clients(user: User) -> Dict[str, Any]:
+    """Build the per-request LLM client mapping for ``user``.
+
+    The result always has two keys, ``"orchestrator"`` and ``"personas"``:
+
+    - user has no ``llm_config``: both are ``None`` so callers can fall
+      back to their own defaults;
+    - ``mode == "uniform"``: one client for ``default_backend`` is shared by
+      the orchestrator and every registered persona;
+    - any other mode (hybrid): the orchestrator uses ``orchestrator_backend``
+      and each persona its ``persona_backends`` entry; either one falls
+      back to ``default_backend`` when not set.
+    """
+    llm_config = user.llm_config
+    if llm_config is None:
+        return {"orchestrator": None, "personas": None}
+
+    fallback = llm_config.default_backend
+    personas = chat_orchestrator.personas
+
+    if llm_config.mode == "uniform":
+        shared = get_llm_client(fallback)
+        return {
+            "orchestrator": shared,
+            "personas": dict.fromkeys(personas, shared),
+        }
+
+    # Hybrid: resolve the orchestrator first, then each persona in turn.
+    overrides = llm_config.persona_backends or {}
+    return {
+        "orchestrator": get_llm_client(llm_config.orchestrator_backend or fallback),
+        "personas": {pid: get_llm_client(overrides.get(pid, fallback)) for pid in personas},
+    }
+
 # Enhanced data models
 class UserInput(BaseModel):
     user_input: str
@@ -81,6 +116,11 @@ async def chat_stream(
 
     async def _event_generator():
         try:
+            # Resolve per-user LLM clients from their stored config
+            llm_clients = resolve_llm_clients(current_user)
+            orchestrator_llm = llm_clients["orchestrator"]
+            persona_llms = llm_clients["personas"]
+
             # Load or create the in-memory session
             if message.chat_session_id:
                 sid = f"chat_{message.chat_session_id}"
@@ -107,7 +147,9 @@ async def _event_generator():
                 ).to_ndjson()
 
             if await chat_orchestrator.needs_clarification_improved(session, message.user_input):
-                clar = await chat_orchestrator.generate_contextual_clarification(message.user_input)
+                clar = await chat_orchestrator.generate_contextual_clarification(
+                    message.user_input, llm_client=orchestrator_llm,
+                )
                 yield ChatStreamLine(
                     type="clarification",
                     data={
@@ -123,7 +165,9 @@ async def _event_generator():
 
             # If an enabled tool can handle this query, return its response
             # directly and skip persona generation.
-            tool_result = await chat_orchestrator.get_tool_response(message.user_input)
+            tool_result = await chat_orchestrator.get_tool_response(
+                message.user_input, llm_client=orchestrator_llm,
+            )
             if tool_result.used_tool:
                 # Append user message to in-memory session and persist to MongoDB
                 session.append_message("orchestrator", tool_result.text)
@@ -164,6 +208,7 @@ async def _event_generator():
             top_personas = await chat_orchestrator.get_top_personas(
                 session_id=sid,
                 allowed_ids=available,
+                llm_client=orchestrator_llm,
             )
 
             # Guard against race condition where all selected advisors
@@ -210,9 +255,11 @@ async def _run(pid: str) -> None:
                             "document_chunks_used": 0,
                         })
                         return
+                    persona_llm = (persona_llms or {}).get(pid)
                     result = await chat_orchestrator.generate_single_persona_response(
                         session, persona,
                         message.response_length or "medium",
+                        llm_client=persona_llm,
                     )
                     session.append_message(pid, result["response"])
                     await done_queue.put(result)
@@ -390,7 +437,10 @@ async def create_new_chat(
         raise HTTPException(status_code=500, detail="Failed to create new chat")
 
 @router.post("/chat/{persona_id}")
-async def chat_with_specific_advisor(persona_id: str, input: UserInput, request: Request):
+async def chat_with_specific_advisor(
+    persona_id: str, input: UserInput, request: Request,
+    current_user: User = Depends(get_current_active_user),
+):
     """Chat with a specific advisor - UPDATED"""
     try:
         if persona_id not in chat_orchestrator.personas:
@@ -408,11 +458,15 @@ async def chat_with_specific_advisor(persona_id: str, input: UserInput, request:
                     isExpandRequest=True,
                 ),
             )
+
+        llm_clients = resolve_llm_clients(current_user)
+        persona_llm = (llm_clients["personas"] or {}).get(persona_id)
 
         result = await chat_orchestrator.chat_with_persona(
             user_input=input.user_input,
             persona_id=persona_id,
-            session_id=session_id
+            session_id=session_id,
+            llm_client=persona_llm,
         )
 
         # Handle response structure
@@ -479,7 +533,10 @@ async def chat_with_specific_advisor(persona_id: str, input: UserInput, request:
         }
 
 @router.post("/reply-to-advisor")
-async def reply_to_advisor(reply: ReplyToAdvisor, request: Request):
+async def reply_to_advisor(
+    reply: ReplyToAdvisor, request: Request,
+    current_user: User = Depends(get_current_active_user),
+):
     """Reply to a specific advisor with proper context - UPDATED"""
     try:
         if reply.advisor_id not in chat_orchestrator.personas:
@@ -520,10 +577,14 @@ async def reply_to_advisor(reply: ReplyToAdvisor, request: Request):
         if original_message:
             contextual_input = f"[Replying to your previous message: '{original_message[:100]}...'] {reply.user_input}"
 
+        llm_clients = resolve_llm_clients(current_user)
+        advisor_llm = (llm_clients["personas"] or {}).get(reply.advisor_id)
+
         result = await chat_orchestrator.chat_with_persona(
             user_input=contextual_input,
             persona_id=reply.advisor_id,
-            session_id=session_id
+            session_id=session_id,
+            llm_client=advisor_llm,
         )
 
         # Handle response structure
@@ -600,15 +661,22 @@ async def reply_to_advisor(reply: ReplyToAdvisor, request: Request):
         }
 
 @router.post("/ask/")
-async def ask_question(query: PersonaQuery, request: Request):
+async def ask_question(
+    query: PersonaQuery, request: Request,
+    current_user: User = Depends(get_current_active_user),
+):
     """Ask question - UPDATED"""
     try:
         session_id = await get_or_create_session_for_request_async(request)
 
+        llm_clients = resolve_llm_clients(current_user)
+        persona_llm = (llm_clients["personas"] or {}).get(query.persona)
+
         result = await chat_orchestrator.chat_with_persona(
             user_input=query.question,
             persona_id=query.persona,
-            session_id=session_id
+            session_id=session_id,
+            llm_client=persona_llm,
         )
 
         if result["type"] == "single_persona_response":

diff --git a/multi_llm_chatbot_backend/app/api/routes/provider.py b/multi_llm_chatbot_backend/app/api/routes/provider.py
@@ -1,108 +1,72 @@
-from fastapi import APIRouter, Body, HTTPException
-from app.config import get_settings
-from app.llm.improved_gemini_client import ImprovedGeminiClient
-from app.llm.improved_ollama_client import ImprovedOllamaClient
-from app.llm.improved_vllm_client import ImprovedVllmClient
-from app.models.default_personas import get_default_personas
-from app.core.bootstrap import chat_orchestrator, llm, current_provider, available_providers
-from app.core.brainforge_sync import BRAINFORGE_PERSONA_PREFIX
-from pydantic import BaseModel
-import os
+from fastapi import APIRouter, Depends, HTTPException, status
+from app.core.auth import get_current_active_user
+from app.core.bootstrap import (
+    chat_orchestrator, get_llm_client, AVAILABLE_BACKENDS, _is_backend_enabled,
+)
+from app.core.database import get_database
+from app.models.user import User, UserLLMConfig
 import logging
 
 logger = logging.getLogger(__name__)
 
 router = APIRouter()
 
-def create_llm_client(provider: str = None):
-    global current_provider
-    if provider is None:
-        provider = current_provider
-
-    if provider == "gemini":
-        try:
-            return ImprovedGeminiClient(model_name=os.getenv("GEMINI_MODEL"))
-        except ValueError as e:
-            logger.warning(f"Gemini API key not found, falling back to Ollama: {e}")
-            return ImprovedOllamaClient(model_name="llama3.2:1b")
-    elif provider == "ollama":
-        return ImprovedOllamaClient(model_name="llama3.2:1b")
-    elif provider == "vllm":
-        settings = get_settings()
-        if not settings.llm.vllm.api_url:
-            raise ValueError("No vLLM endpoint configured. Set llm.vllm.api_url in your config.")
-        return ImprovedVllmClient(
-            api_url=settings.llm.vllm.api_url,
-            api_key=settings.llm.vllm.api_key,
-        )
-    else:
-        raise ValueError(f"Unknown provider: {provider}")
-
-# Initialize LLM and personas
-llm = create_llm_client(current_provider)
-DEFAULT_PERSONAS = get_default_personas(llm)
-for persona in DEFAULT_PERSONAS:
-    chat_orchestrator.register_persona(persona)
-
-class ProviderSwitch(BaseModel):
-    provider: str
 
 @router.get("/current-provider")
-async def get_current_provider():
+async def get_current_provider(
+    current_user: User = Depends(get_current_active_user),
+):
+    """Return the user's saved LLM config (defaults if none) and AVAILABLE_BACKENDS."""
+    config = current_user.llm_config or UserLLMConfig()
     return {
-        "current_provider": current_provider,
-        "available_providers": available_providers,
-        "model_info": {
-            "name": llm.model_name if hasattr(llm, 'model_name') else "gemini-2.0-flash",
-            "provider": current_provider
-        }
+        "llm_config": config.model_dump(),
+        "available_backends": AVAILABLE_BACKENDS,
     }
 
-@router.post("/switch-provider")
-async def switch_provider(provider_data: ProviderSwitch):
-    global current_provider, llm
-
-    if provider_data.provider not in available_providers:
-        raise HTTPException(status_code=400, detail=f"Unknown provider: {provider_data.provider}. Available: {available_providers}")
-
-    try:
-        current_provider = provider_data.provider
-        new_llm = create_llm_client(current_provider)
-        llm = new_llm
 
-        chat_orchestrator.llm_client = new_llm
-
-        new_personas = get_default_personas(new_llm)
-        # Clear only non-BrainForge personas; BF advisors have their own LLM clients
-        non_bf_ids = [pid for pid in chat_orchestrator.personas if not pid.startswith(f"{BRAINFORGE_PERSONA_PREFIX}_")]
-        for pid in non_bf_ids:
-            chat_orchestrator.unregister_persona(pid)
-        for persona in new_personas:
-            chat_orchestrator.register_persona(persona)
-
-        return {
-            "message": f"Successfully switched to {current_provider}",
-            "current_provider": current_provider,
-            "model_info": {
-                "name": new_llm.model_name if hasattr(new_llm, 'model_name') else "gemini-2.0-flash",
-                "provider": current_provider
-            }
-        }
-
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to switch to {provider_data.provider}: {str(e)}")
-
-@router.post("/switch-model")
-async def switch_model(model_name: str = Body(...)):
-    if "gemini" in model_name.lower():
-        return await switch_provider(ProviderSwitch(provider="gemini"))
-    else:
-        return await switch_provider(ProviderSwitch(provider="ollama"))
+@router.post("/switch-provider")
+async def switch_provider(
+    llm_config: UserLLMConfig,
+    current_user: User = Depends(get_current_active_user),
+):
+    """Validate the submitted LLM config (400 on failure) and save it on the user."""
+    if llm_config.mode == "hybrid" and llm_config.persona_backends:
+        registered = set(chat_orchestrator.personas)
+        unknown = set(llm_config.persona_backends) - registered
+        if unknown:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=f"Unknown persona IDs: {sorted(unknown)}. "
+                       f"Valid IDs: {sorted(registered)}",
+            )
+
+    # Every backend the config references must be enabled and constructible.
+    backends_to_check = {llm_config.default_backend}
+    if llm_config.orchestrator_backend:
+        backends_to_check.add(llm_config.orchestrator_backend)
+    backends_to_check.update((llm_config.persona_backends or {}).values())
+
+    for backend in backends_to_check:
+        if not _is_backend_enabled(backend):
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=f"Backend {backend!r} is disabled by the administrator.",
+            )
+        try:
+            get_llm_client(backend)
+        except Exception as exc:  # any construction failure is reported as a 400
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=f"Backend {backend!r} is not configured: {exc}",
+            ) from exc
+
+    db = get_database()
+    await db.users.update_one(
+        {"_id": current_user.id},
+        {"$set": {"llm_config": llm_config.model_dump()}},
+    )  # NOTE(review): result is discarded; a filter matching no document goes unnoticed
 
-@router.get("/current-model")
-async def get_current_model():
-    model_name = llm.model_name if hasattr(llm, 'model_name') else "gemini-2.0-flash"
     return {
-        "model": model_name,
-        "provider": current_provider
+        "message": "LLM configuration updated",
+        "llm_config": llm_config.model_dump(),
     }
diff --git a/multi_llm_chatbot_backend/app/config.py b/multi_llm_chatbot_backend/app/config.py
@@ -253,6 +253,7 @@ def _warn_connection_envvar(self):
 
 
 class GeminiConfig(BaseModel):
+    enabled: bool = True
     api_key: str = Field(default=os.getenv("GEMINI_API_KEY"))
     model: str = "gemini-2.5-flash"
 
@@ -272,12 +273,14 @@ def _warn_gemini_envvar(self):
 
 
 class OllamaConfig(BaseModel):
+    enabled: bool = True  # NOTE(review): presumably the flag _is_backend_enabled() consults — confirm
     model: str = "llama3.2:1b"
     # TODO: Drop support for `OLLAMA_BASE_URL` envvar handling
     base_url: str = Field(default=os.getenv("OLLAMA_BASE_URL", "http://localhost:11434"))
 
 
 class VllmConfig(BaseModel):
+    enabled: bool = True
     api_url: str = ""
     api_key: str = Field(default=os.getenv("VLLM_API_KEY", ""))
 
@@ -290,10 +293,12 @@ class BrainForgeConfig(BaseModel):
 
 
 class LLMConfig(BaseModel):
+    default_backend: str = ""  # NOTE(review): empty by default; how "" is resolved is not visible here
     gemini: GeminiConfig = GeminiConfig()
     ollama: OllamaConfig = OllamaConfig()
     vllm: VllmConfig = VllmConfig()
     brainforge: BrainForgeConfig = BrainForgeConfig()
+    health_check_interval_seconds: int = 300  # presumably the backend health-check period — TODO confirm
 
 
 class RAGConfig(BaseModel):