diff --git a/agents/customer_explorer_agent.py b/agents/customer_explorer_agent.py
index a1d8adc..e1d5ab3 100644
--- a/agents/customer_explorer_agent.py
+++ b/agents/customer_explorer_agent.py
@@ -6,9 +6,11 @@
 import dotenv
 from agno.agent import Agent
 from agno.models.openai import OpenAIChat
+from agno.models.nebius import Nebius
 from pydantic import BaseModel
 from datetime import datetime, timedelta
 import json
+import agent_config
 
 dotenv.load_dotenv()
 
@@ -116,12 +118,9 @@ def create_customer_explorer_agent() -> Agent:
     """Create and return the customer explorer agent"""
     return Agent(
         name="CustomerExplorerAgent",
-        model=OpenAIChat(
-            id="gpt-4o-mini",
-            api_key=os.getenv("OPENAI_API_KEY"),
-        ),
+        model=agent_config.get_model(),
         description=CUSTOMER_EXPLORER_SYSTEM_PROMPT,
-        add_history_to_context=False,
+        add_history_to_context=True,
     )
 
 def explore_customer_context(
diff --git a/agents/next_message_agent.py b/agents/next_message_agent.py
index 90e956d..edaa70d 100644
--- a/agents/next_message_agent.py
+++ b/agents/next_message_agent.py
@@ -6,7 +6,9 @@
 import dotenv
 from agno.agent import Agent
 from agno.models.openai import OpenAIChat
+from agno.models.nebius import Nebius
 from pydantic import BaseModel
+import agent_config
 
 dotenv.load_dotenv()
 
@@ -131,12 +133,9 @@ def create_next_message_agent() -> Agent:
     """Create and return the next message agent"""
     return Agent(
         name="NextMessageAgent",
-        model=OpenAIChat(
-            id="gpt-4o-mini",
-            api_key=os.getenv("OPENAI_API_KEY"),
-        ),
+        model=agent_config.get_model(),
         description=NEXT_MESSAGE_SYSTEM_PROMPT,
-        add_history_to_context=False,
+        add_history_to_context=True,
     )
 
 def suggest_next_message(
diff --git a/agents/summary_agent.py b/agents/summary_agent.py
index 272871b..3e4b46a 100644
--- a/agents/summary_agent.py
+++ b/agents/summary_agent.py
@@ -6,7 +6,9 @@
 import dotenv
 from agno.agent import Agent
 from agno.models.openai import OpenAIChat
+from agno.models.nebius import Nebius
 from pydantic import BaseModel
+import agent_config
 
 dotenv.load_dotenv()
 
@@ -46,12 +48,9 @@ def create_summary_agent() -> Agent:
     """Create and return the summary agent"""
     return Agent(
         name="SummaryAgent",
-        model=OpenAIChat(
-            id="gpt-4o-mini",
-            api_key=os.getenv("OPENAI_API_KEY"),
-        ),
+        model=agent_config.get_model(),
         description=SUMMARY_SYSTEM_PROMPT,
-        add_history_to_context=False,  # Each call is independent
+        add_history_to_context=True,
     )
 
 def summarize_conversation(messages: List[Dict[str, Any]]) -> MessageSummary:
diff --git a/main_support_agent.py b/main_support_agent.py
index faa8a26..c21ddd4 100644
--- a/main_support_agent.py
+++ b/main_support_agent.py
@@ -10,6 +10,7 @@
 import dotenv
 from agno.agent import Agent
 from agno.models.openai import OpenAIChat
+from agno.models.nebius import Nebius
 
 # Import our specialized agents as tools
 from agents.summary_agent import summarize_conversation
@@ -179,9 +180,13 @@ def escalate_to_human(reason: str, urgency: str = "medium") -> str:
 # Create the main support agent
 support_agent = Agent(
     name="BankCustomerSupportAgent",
-    model=OpenAIChat(
-        id="gpt-4o-mini",
-        api_key=os.getenv("OPENAI_API_KEY"),
+    # model=OpenAIChat(
+    #     id="gpt-4o-mini",
+    #     api_key=os.getenv("OPENAI_API_KEY"),
+    # ),
+    model=Nebius(
+        id="openai/gpt-oss-120b",
+        api_key=os.getenv("NEBIUS_API_KEY"),
     ),
     tools=[
         get_conversation_summary,
diff --git a/main_support_agent_claude.py b/main_support_agent_claude.py
new file mode 100644
index 0000000..61fdb79
--- /dev/null
+++ b/main_support_agent_claude.py
@@ -0,0 +1,230 @@
+"""
+Main Bank Customer Support Agent - Claude Opus 4.6 Model
+
+This is the production code - kept very simple. One agent with tools, Agno handles memory.
+"""
+
+import os
+import json
+from typing import Dict, Any
+import dotenv
+from agno.agent import Agent
+from agno.models.anthropic import Claude
+
+import agent_config
+
+dotenv.load_dotenv()
+
+agent_config.set_model(Claude(id="claude-opus-4-6", api_key=os.getenv("ANTHROPIC_API_KEY")))
+
+# Import the specialized agents after agent_config.set_model() above; NOTE(review): order looks deliberate - confirm
+from agents.summary_agent import summarize_conversation
+from agents.next_message_agent import suggest_next_message
+from agents.customer_explorer_agent import (
+    explore_customer_context,
+    analyze_customer_behavior,
+)
+
+import langwatch
+from openinference.instrumentation.agno import AgnoInstrumentor
+
+langwatch.setup(instrumentors=[AgnoInstrumentor()])
+
+SYSTEM_PROMPT = """
+You are a customer support agent for SecureBank, a modern digital banking platform.
+
+Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations:
+
+TOOL USAGE REQUIREMENTS:
+
+1. **explore_customer_account** - ALWAYS use when:
+   - Customer mentions fraud, unauthorized transactions, or security concerns
+   - Customer asks about spending patterns, budgeting, or financial analysis
+   - Customer needs account-specific insights or personalized recommendations
+   - Any urgent business account issues that need immediate investigation
+
+2. **get_message_suggestion** - ALWAYS use when:
+   - Customer has complex, multi-part problems (locked accounts + fees + missing deposits)
+   - You need guidance on complex banking regulations or procedures
+   - Customer issue involves multiple interconnected banking services
+
+3. **escalate_to_human** - ALWAYS use when:
+   - Customer explicitly demands to speak with a manager, supervisor, or human agent
+   - Customer expresses extreme frustration or dissatisfaction
+   - Business customer has urgent issues affecting operations (payroll, employee payments)
+   - Set urgency to "high" for business-critical issues
+
+4. **get_conversation_summary** - Use when:
+   - Customer asks you to summarize the conversation
+   - You need to analyze conversation patterns or sentiment
+
+CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly.
+
+Guidelines:
+- Be helpful, professional, and empathetic
+- Use tools proactively based on the requirements above
+- Provide clear, actionable solutions
+- Always prioritize customer security and privacy
+
+Remember: Tool usage is not optional when the situation matches the requirements above.
+"""
+
+
+def get_conversation_summary(conversation_context: str = "recent messages") -> str:
+    """
+    Tool stub: label the current trace and return a fixed conversation analysis
+
+    Args:
+        conversation_context: What to analyze (accepted but not read by this stub)
+
+    Returns:
+        JSON string with keys summary, sentiment, key_issues, suggested_actions
+    """
+    langwatch.get_current_trace().update(
+        metadata={"labels": ["tool_get_conversation_summary"]}
+    )
+    # Placeholder: a real implementation would read the actual conversation
+    # history; this version always returns the same canned payload
+    return json.dumps(
+        {
+            "summary": "Conversation analysis requested",
+            "sentiment": "neutral",
+            "key_issues": ["general inquiry"],
+            "suggested_actions": ["continue conversation"],
+        }
+    )
+
+
+def get_message_suggestion(customer_query: str, context: str = "") -> str:
+    """
+    Tool stub: label the current trace and return a templated reply suggestion
+
+    Args:
+        customer_query: The customer's question; echoed into the suggested reply
+        context: Additional conversation context (accepted but not read here)
+
+    Returns:
+        JSON string with suggested_response, confidence, knowledge_sources, alternatives
+    """
+    langwatch.get_current_trace().update(
+        metadata={"labels": ["tool_get_message_suggestion"]}
+    )
+    # Simulated knowledge base lookup: every field except the echoed query is fixed
+    suggestion_data = {
+        "suggested_response": f"I understand your concern about: {customer_query}. Let me help you with that.",
+        "confidence": "medium",
+        "knowledge_sources": ["general_banking_guide"],
+        "alternatives": ["Ask for more details", "Escalate to specialist"],
+    }
+    return json.dumps(suggestion_data)
+
+
+def explore_customer_account(customer_id: str, query: str) -> str:
+    """
+    Combine the customer explorer's behavior analysis and rich experiences
+
+    Args:
+        customer_id: Customer identifier (e.g., CUST_001)
+        query: What to explore about the customer
+
+    Returns:
+        JSON string with keys customer_behavior and rich_experiences
+    """
+    # Behavior analysis; embedded as-is below, so it must be JSON-serializable
+    behavior = analyze_customer_behavior(customer_id)
+
+    # Rich experiences for this query; each item is flattened to a dict below
+    rich_experiences = explore_customer_context(customer_id, query)
+
+    langwatch.get_current_trace().update(
+        metadata={"labels": ["tool_explore_customer_account"]}
+    )
+
+    return json.dumps(
+        {
+            "customer_behavior": behavior,
+            "rich_experiences": [
+                {
+                    "type": exp.component_type,
+                    "title": exp.title,
+                    "data": exp.data,
+                    "actions": exp.actions,
+                    "priority": exp.priority,
+                }
+                for exp in rich_experiences
+            ],
+        }
+    )
+
+
+def escalate_to_human(reason: str, urgency: str = "medium") -> str:
+    """
+    Label the current trace as an escalation and return hand-off details
+
+    Args:
+        reason: Why the escalation is needed
+        urgency: Priority level (low, medium, high); matched case-insensitively
+
+    Returns:
+        JSON string with escalated, reason, urgency, estimated_wait and message
+    """
+    langwatch.get_current_trace().update(metadata={"labels": ["escalation"]})
+    # Compare case-insensitively: a tool call may pass "High" or " HIGH"
+    escalation_data = {
+        "escalated": True,
+        "reason": reason,
+        "urgency": urgency,
+        "estimated_wait": "5-10 minutes" if str(urgency).strip().lower() == "high" else "10-15 minutes",
+        "message": "I'm connecting you with a specialist who can provide additional assistance.",
+    }
+    return json.dumps(escalation_data)
+
+
+# Create the main support agent (module-level instance used by chat_with_agent)
+support_agent = Agent(
+    name="BankCustomerSupportAgent",
+    model=Claude(  # duplicates the model passed to agent_config.set_model() above; keep both in sync
+        id="claude-opus-4-6",
+        api_key=os.getenv("ANTHROPIC_API_KEY"),
+    ),
+    tools=[
+        get_conversation_summary,
+        get_message_suggestion,
+        explore_customer_account,
+        escalate_to_human,
+    ],
+    description=SYSTEM_PROMPT,
+    add_history_to_context=True,  # Let Agno handle memory
+    num_history_runs=100,  # NOTE(review): not set in the DeepSeek/GLM variants - confirm intended
+)
+
+
+# Simple interface for testing
+def chat_with_agent(message: str) -> str:
+    """Run one turn against the module-level support agent and return its reply content"""
+    run_output = support_agent.run(message)
+    return run_output.content
+
+
+# Example usage: two-turn demo, executed only when this file is run as a script
+if __name__ == "__main__":
+    print("=== Bank Customer Support Agent ===")
+    print(
+        "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?"
+    )
+
+    # Turn 1: customer reports suspected fraud
+    customer_message = "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud."
+    print(f"\nCustomer: {customer_message}")
+
+    response = chat_with_agent(customer_message)
+    print(f"Agent: {response}")
+
+    # Turn 2: follow-up sent to the same agent instance
+    customer_message2 = "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?"
+    print(f"\nCustomer: {customer_message2}")
+
+    response2 = chat_with_agent(customer_message2)
+    print(f"Agent: {response2}")
+
+    print("\n=== Conversation Complete ===")
diff --git a/main_support_agent_deepseek.py b/main_support_agent_deepseek.py
new file mode 100644
index 0000000..d57e791
--- /dev/null
+++ b/main_support_agent_deepseek.py
@@ -0,0 +1,229 @@
+"""
+Main Bank Customer Support Agent - DeepSeek V3.2 Model
+
+This is the production code - kept very simple. One agent with tools, Agno handles memory.
+"""
+
+import os
+import json
+from typing import Dict, Any
+import dotenv
+from agno.agent import Agent
+from agno.models.nebius import Nebius
+
+import agent_config
+
+dotenv.load_dotenv()
+
+agent_config.set_model(Nebius(id="deepseek-ai/DeepSeek-V3.2", api_key=os.getenv("NEBIUS_API_KEY")))
+
+# Import the specialized agents after agent_config.set_model() above; NOTE(review): order looks deliberate - confirm
+from agents.summary_agent import summarize_conversation
+from agents.next_message_agent import suggest_next_message
+from agents.customer_explorer_agent import (
+    explore_customer_context,
+    analyze_customer_behavior,
+)
+
+import langwatch
+from openinference.instrumentation.agno import AgnoInstrumentor
+
+langwatch.setup(instrumentors=[AgnoInstrumentor()])
+
+SYSTEM_PROMPT = """
+You are a customer support agent for SecureBank, a modern digital banking platform.
+
+Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations:
+
+TOOL USAGE REQUIREMENTS:
+
+1. **explore_customer_account** - ALWAYS use when:
+   - Customer mentions fraud, unauthorized transactions, or security concerns
+   - Customer asks about spending patterns, budgeting, or financial analysis
+   - Customer needs account-specific insights or personalized recommendations
+   - Any urgent business account issues that need immediate investigation
+
+2. **get_message_suggestion** - ALWAYS use when:
+   - Customer has complex, multi-part problems (locked accounts + fees + missing deposits)
+   - You need guidance on complex banking regulations or procedures
+   - Customer issue involves multiple interconnected banking services
+
+3. **escalate_to_human** - ALWAYS use when:
+   - Customer explicitly demands to speak with a manager, supervisor, or human agent
+   - Customer expresses extreme frustration or dissatisfaction
+   - Business customer has urgent issues affecting operations (payroll, employee payments)
+   - Set urgency to "high" for business-critical issues
+
+4. **get_conversation_summary** - Use when:
+   - Customer asks you to summarize the conversation
+   - You need to analyze conversation patterns or sentiment
+
+CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly.
+
+Guidelines:
+- Be helpful, professional, and empathetic
+- Use tools proactively based on the requirements above
+- Provide clear, actionable solutions
+- Always prioritize customer security and privacy
+
+Remember: Tool usage is not optional when the situation matches the requirements above.
+"""
+
+
+def get_conversation_summary(conversation_context: str = "recent messages") -> str:
+    """
+    Tool stub: label the current trace and return a fixed conversation analysis
+
+    Args:
+        conversation_context: What to analyze (accepted but not read by this stub)
+
+    Returns:
+        JSON string with keys summary, sentiment, key_issues, suggested_actions
+    """
+    langwatch.get_current_trace().update(
+        metadata={"labels": ["tool_get_conversation_summary"]}
+    )
+    # Placeholder: a real implementation would read the actual conversation
+    # history; this version always returns the same canned payload
+    return json.dumps(
+        {
+            "summary": "Conversation analysis requested",
+            "sentiment": "neutral",
+            "key_issues": ["general inquiry"],
+            "suggested_actions": ["continue conversation"],
+        }
+    )
+
+
+def get_message_suggestion(customer_query: str, context: str = "") -> str:
+    """
+    Tool stub: label the current trace and return a templated reply suggestion
+
+    Args:
+        customer_query: The customer's question; echoed into the suggested reply
+        context: Additional conversation context (accepted but not read here)
+
+    Returns:
+        JSON string with suggested_response, confidence, knowledge_sources, alternatives
+    """
+    langwatch.get_current_trace().update(
+        metadata={"labels": ["tool_get_message_suggestion"]}
+    )
+    # Simulated knowledge base lookup: every field except the echoed query is fixed
+    suggestion_data = {
+        "suggested_response": f"I understand your concern about: {customer_query}. Let me help you with that.",
+        "confidence": "medium",
+        "knowledge_sources": ["general_banking_guide"],
+        "alternatives": ["Ask for more details", "Escalate to specialist"],
+    }
+    return json.dumps(suggestion_data)
+
+
+def explore_customer_account(customer_id: str, query: str) -> str:
+    """
+    Combine the customer explorer's behavior analysis and rich experiences
+
+    Args:
+        customer_id: Customer identifier (e.g., CUST_001)
+        query: What to explore about the customer
+
+    Returns:
+        JSON string with keys customer_behavior and rich_experiences
+    """
+    # Behavior analysis; embedded as-is below, so it must be JSON-serializable
+    behavior = analyze_customer_behavior(customer_id)
+
+    # Rich experiences for this query; each item is flattened to a dict below
+    rich_experiences = explore_customer_context(customer_id, query)
+
+    langwatch.get_current_trace().update(
+        metadata={"labels": ["tool_explore_customer_account"]}
+    )
+
+    return json.dumps(
+        {
+            "customer_behavior": behavior,
+            "rich_experiences": [
+                {
+                    "type": exp.component_type,
+                    "title": exp.title,
+                    "data": exp.data,
+                    "actions": exp.actions,
+                    "priority": exp.priority,
+                }
+                for exp in rich_experiences
+            ],
+        }
+    )
+
+
+def escalate_to_human(reason: str, urgency: str = "medium") -> str:
+    """
+    Label the current trace as an escalation and return hand-off details
+
+    Args:
+        reason: Why the escalation is needed
+        urgency: Priority level (low, medium, high); matched case-insensitively
+
+    Returns:
+        JSON string with escalated, reason, urgency, estimated_wait and message
+    """
+    langwatch.get_current_trace().update(metadata={"labels": ["escalation"]})
+    # Compare case-insensitively: a tool call may pass "High" or " HIGH"
+    escalation_data = {
+        "escalated": True,
+        "reason": reason,
+        "urgency": urgency,
+        "estimated_wait": "5-10 minutes" if str(urgency).strip().lower() == "high" else "10-15 minutes",
+        "message": "I'm connecting you with a specialist who can provide additional assistance.",
+    }
+    return json.dumps(escalation_data)
+
+
+# Create the main support agent (module-level instance used by chat_with_agent)
+support_agent = Agent(
+    name="BankCustomerSupportAgent",
+    model=Nebius(  # duplicates the model passed to agent_config.set_model() above; keep both in sync
+        id="deepseek-ai/DeepSeek-V3.2",
+        api_key=os.getenv("NEBIUS_API_KEY"),
+    ),
+    tools=[
+        get_conversation_summary,
+        get_message_suggestion,
+        explore_customer_account,
+        escalate_to_human,
+    ],
+    description=SYSTEM_PROMPT,
+    add_history_to_context=True,  # Let Agno handle memory
+)
+
+
+# Simple interface for testing
+def chat_with_agent(message: str) -> str:
+    """Run one turn against the module-level support agent and return its reply content"""
+    run_output = support_agent.run(message)
+    return run_output.content
+
+
+# Example usage: two-turn demo, executed only when this file is run as a script
+if __name__ == "__main__":
+    print("=== Bank Customer Support Agent ===")
+    print(
+        "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?"
+    )
+
+    # Turn 1: customer reports suspected fraud
+    customer_message = "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud."
+    print(f"\nCustomer: {customer_message}")
+
+    response = chat_with_agent(customer_message)
+    print(f"Agent: {response}")
+
+    # Turn 2: follow-up sent to the same agent instance
+    customer_message2 = "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?"
+    print(f"\nCustomer: {customer_message2}")
+
+    response2 = chat_with_agent(customer_message2)
+    print(f"Agent: {response2}")
+
+    print("\n=== Conversation Complete ===")
diff --git a/main_support_agent_glm.py b/main_support_agent_glm.py
new file mode 100644
index 0000000..2440441
--- /dev/null
+++ b/main_support_agent_glm.py
@@ -0,0 +1,229 @@
+"""
+Main Bank Customer Support Agent - GLM-4.7-FP8 Model
+
+This is the production code - kept very simple. One agent with tools, Agno handles memory.
+"""
+
+import os
+import json
+from typing import Dict, Any
+import dotenv
+from agno.agent import Agent
+from agno.models.nebius import Nebius
+
+import agent_config
+
+dotenv.load_dotenv()
+
+agent_config.set_model(Nebius(id="zai-org/GLM-4.7-FP8", api_key=os.getenv("NEBIUS_API_KEY")))
+
+# Import the specialized agents after agent_config.set_model() above; NOTE(review): order looks deliberate - confirm
+from agents.summary_agent import summarize_conversation
+from agents.next_message_agent import suggest_next_message
+from agents.customer_explorer_agent import (
+    explore_customer_context,
+    analyze_customer_behavior,
+)
+
+import langwatch
+from openinference.instrumentation.agno import AgnoInstrumentor
+
+langwatch.setup(instrumentors=[AgnoInstrumentor()])
+
+SYSTEM_PROMPT = """
+You are a customer support agent for SecureBank, a modern digital banking platform.
+
+Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations:
+
+TOOL USAGE REQUIREMENTS:
+
+1. **explore_customer_account** - ALWAYS use when:
+   - Customer mentions fraud, unauthorized transactions, or security concerns
+   - Customer asks about spending patterns, budgeting, or financial analysis
+   - Customer needs account-specific insights or personalized recommendations
+   - Any urgent business account issues that need immediate investigation
+
+2. **get_message_suggestion** - ALWAYS use when:
+   - Customer has complex, multi-part problems (locked accounts + fees + missing deposits)
+   - You need guidance on complex banking regulations or procedures
+   - Customer issue involves multiple interconnected banking services
+
+3. **escalate_to_human** - ALWAYS use when:
+   - Customer explicitly demands to speak with a manager, supervisor, or human agent
+   - Customer expresses extreme frustration or dissatisfaction
+   - Business customer has urgent issues affecting operations (payroll, employee payments)
+   - Set urgency to "high" for business-critical issues
+
+4. **get_conversation_summary** - Use when:
+   - Customer asks you to summarize the conversation
+   - You need to analyze conversation patterns or sentiment
+
+CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly.
+
+Guidelines:
+- Be helpful, professional, and empathetic
+- Use tools proactively based on the requirements above
+- Provide clear, actionable solutions
+- Always prioritize customer security and privacy
+
+Remember: Tool usage is not optional when the situation matches the requirements above.
+"""
+
+
+def get_conversation_summary(conversation_context: str = "recent messages") -> str:
+    """
+    Tool stub: label the current trace and return a fixed conversation analysis
+
+    Args:
+        conversation_context: What to analyze (accepted but not read by this stub)
+
+    Returns:
+        JSON string with keys summary, sentiment, key_issues, suggested_actions
+    """
+    langwatch.get_current_trace().update(
+        metadata={"labels": ["tool_get_conversation_summary"]}
+    )
+    # Placeholder: a real implementation would read the actual conversation
+    # history; this version always returns the same canned payload
+    return json.dumps(
+        {
+            "summary": "Conversation analysis requested",
+            "sentiment": "neutral",
+            "key_issues": ["general inquiry"],
+            "suggested_actions": ["continue conversation"],
+        }
+    )
+
+
+def get_message_suggestion(customer_query: str, context: str = "") -> str:
+    """
+    Tool stub: label the current trace and return a templated reply suggestion
+
+    Args:
+        customer_query: The customer's question; echoed into the suggested reply
+        context: Additional conversation context (accepted but not read here)
+
+    Returns:
+        JSON string with suggested_response, confidence, knowledge_sources, alternatives
+    """
+    langwatch.get_current_trace().update(
+        metadata={"labels": ["tool_get_message_suggestion"]}
+    )
+    # Simulated knowledge base lookup: every field except the echoed query is fixed
+    suggestion_data = {
+        "suggested_response": f"I understand your concern about: {customer_query}. Let me help you with that.",
+        "confidence": "medium",
+        "knowledge_sources": ["general_banking_guide"],
+        "alternatives": ["Ask for more details", "Escalate to specialist"],
+    }
+    return json.dumps(suggestion_data)
+
+
+def explore_customer_account(customer_id: str, query: str) -> str:
+    """
+    Combine the customer explorer's behavior analysis and rich experiences
+
+    Args:
+        customer_id: Customer identifier (e.g., CUST_001)
+        query: What to explore about the customer
+
+    Returns:
+        JSON string with keys customer_behavior and rich_experiences
+    """
+    # Behavior analysis; embedded as-is below, so it must be JSON-serializable
+    behavior = analyze_customer_behavior(customer_id)
+
+    # Rich experiences for this query; each item is flattened to a dict below
+    rich_experiences = explore_customer_context(customer_id, query)
+
+    langwatch.get_current_trace().update(
+        metadata={"labels": ["tool_explore_customer_account"]}
+    )
+
+    return json.dumps(
+        {
+            "customer_behavior": behavior,
+            "rich_experiences": [
+                {
+                    "type": exp.component_type,
+                    "title": exp.title,
+                    "data": exp.data,
+                    "actions": exp.actions,
+                    "priority": exp.priority,
+                }
+                for exp in rich_experiences
+            ],
+        }
+    )
+
+
+def escalate_to_human(reason: str, urgency: str = "medium") -> str:
+    """
+    Label the current trace as an escalation and return hand-off details
+
+    Args:
+        reason: Why the escalation is needed
+        urgency: Priority level (low, medium, high); matched case-insensitively
+
+    Returns:
+        JSON string with escalated, reason, urgency, estimated_wait and message
+    """
+    langwatch.get_current_trace().update(metadata={"labels": ["escalation"]})
+    # Compare case-insensitively: a tool call may pass "High" or " HIGH"
+    escalation_data = {
+        "escalated": True,
+        "reason": reason,
+        "urgency": urgency,
+        "estimated_wait": "5-10 minutes" if str(urgency).strip().lower() == "high" else "10-15 minutes",
+        "message": "I'm connecting you with a specialist who can provide additional assistance.",
+    }
+    return json.dumps(escalation_data)
+
+
+# Create the main support agent (module-level instance used by chat_with_agent)
+support_agent = Agent(
+    name="BankCustomerSupportAgent",
+    model=Nebius(  # duplicates the model passed to agent_config.set_model() above; keep both in sync
+        id="zai-org/GLM-4.7-FP8",
+        api_key=os.getenv("NEBIUS_API_KEY"),
+    ),
+    tools=[
+        get_conversation_summary,
+        get_message_suggestion,
+        explore_customer_account,
+        escalate_to_human,
+    ],
+    description=SYSTEM_PROMPT,
+    add_history_to_context=True,  # Let Agno handle memory
+)
+
+
+# Simple interface for testing
+def chat_with_agent(message: str) -> str:
+    """Run one turn against the module-level support agent and return its reply content"""
+    run_output = support_agent.run(message)
+    return run_output.content
+
+
+# Example usage: two-turn demo, executed only when this file is run as a script
+if __name__ == "__main__":
+    print("=== Bank Customer Support Agent ===")
+    print(
+        "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?"
+    )
+
+    # Turn 1: customer reports suspected fraud
+    customer_message = "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud."
+    print(f"\nCustomer: {customer_message}")
+
+    response = chat_with_agent(customer_message)
+    print(f"Agent: {response}")
+
+    # Turn 2: follow-up sent to the same agent instance
+    customer_message2 = "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?"
+    print(f"\nCustomer: {customer_message2}")
+
+    response2 = chat_with_agent(customer_message2)
+    print(f"Agent: {response2}")
+
+    print("\n=== Conversation Complete ===")
diff --git a/main_support_agent_minimax.py b/main_support_agent_minimax.py
new file mode 100644
index 0000000..55a05fe
--- /dev/null
+++ b/main_support_agent_minimax.py
@@ -0,0 +1,229 @@
+"""
+Main Bank Customer Support Agent - MiniMax-M2.1 Model
+
+This is the production code - kept very simple. One agent with tools, Agno handles memory.
+"""
+
+import os
+import json
+from typing import Dict, Any
+import dotenv
+from agno.agent import Agent
+from agno.models.nebius import Nebius
+
+import agent_config
+
+dotenv.load_dotenv()
+
+agent_config.set_model(Nebius(id="MiniMaxAI/MiniMax-M2.1", api_key=os.getenv("NEBIUS_API_KEY")))
+
+# Import our specialized agents as tools
+from agents.summary_agent import summarize_conversation
+from agents.next_message_agent import suggest_next_message
+from agents.customer_explorer_agent import (
+    explore_customer_context,
+    analyze_customer_behavior,
+)
+
+import langwatch
+from openinference.instrumentation.agno import AgnoInstrumentor
+
+langwatch.setup(instrumentors=[AgnoInstrumentor()])
+
+SYSTEM_PROMPT = """
+You are a customer support agent for SecureBank, a modern digital banking platform.
+
+Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations:
+
+TOOL USAGE REQUIREMENTS:
+
+1. **explore_customer_account** - ALWAYS use when:
+   - Customer mentions fraud, unauthorized transactions, or security concerns
+   - Customer asks about spending patterns, budgeting, or financial analysis
+   - Customer needs account-specific insights or personalized recommendations
+   - Any urgent business account issues that need immediate investigation
+
+2. **get_message_suggestion** - ALWAYS use when:
+   - Customer has complex, multi-part problems (locked accounts + fees + missing deposits)
+   - You need guidance on complex banking regulations or procedures
+   - Customer issue involves multiple interconnected banking services
+
+3. **escalate_to_human** - ALWAYS use when:
+   - Customer explicitly demands to speak with a manager, supervisor, or human agent
+   - Customer expresses extreme frustration or dissatisfaction
+   - Business customer has urgent issues affecting operations (payroll, employee payments)
+   - Set urgency to "high" for business-critical issues
+
+4. **get_conversation_summary** - Use when:
+   - Customer asks you to summarize the conversation
+   - You need to analyze conversation patterns or sentiment
+
+CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly.
+
+Guidelines:
+- Be helpful, professional, and empathetic
+- Use tools proactively based on the requirements above
+- Provide clear, actionable solutions
+- Always prioritize customer security and privacy
+
+Remember: Tool usage is not optional when the situation matches the requirements above.
+"""  # passed unchanged as Agent(description=...) when support_agent is built below; tool names must match the functions in this file
+
+
+def get_conversation_summary(conversation_context: str = "recent messages") -> str:
+    """
+    Analyze the conversation for patterns, sentiment, and key issues
+
+    Args:
+        conversation_context: Context about what to analyze
+
+    Returns:
+        JSON string with conversation analysis
+    """
+    # Docstring kept verbatim: presumably it doubles as the tool description shown to the model.
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_conversation_summary"]})
+
+    # Placeholder: no conversation history is read yet, so conversation_context
+    # is ignored and the same canned analysis is returned on every call.
+    canned_analysis = {
+        "summary": "Conversation analysis requested",
+        "sentiment": "neutral",
+        "key_issues": ["general inquiry"],
+        "suggested_actions": ["continue conversation"],
+    }
+    return json.dumps(canned_analysis)
+
+
+def get_message_suggestion(customer_query: str, context: str = "") -> str:
+    """
+    Get suggestions for responding to customer queries using knowledge base
+
+    Args:
+        customer_query: The customer's question or concern
+        context: Additional context about the conversation
+
+    Returns:
+        JSON string with response suggestions
+    """
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_message_suggestion"]})
+    # Stand-in for a knowledge-base lookup: echoes the query; `context` is unused.
+    echoed_reply = f"I understand your concern about: {customer_query}. Let me help you with that."
+    payload = {
+        "suggested_response": echoed_reply,
+        "confidence": "medium",
+        "knowledge_sources": ["general_banking_guide"],
+        "alternatives": ["Ask for more details", "Escalate to specialist"],
+    }
+    return json.dumps(payload)
+
+
+def explore_customer_account(customer_id: str, query: str) -> str:
+    """
+    Explore customer account data and provide rich insights
+
+    Args:
+        customer_id: Customer identifier (e.g., CUST_001)
+        query: What to explore about the customer
+
+    Returns:
+        JSON string with customer insights and rich experiences
+    """
+    # Helper calls run first, in this order: behavior analysis, then the
+    # query-specific experiences (both come from agents.customer_explorer_agent).
+    behavior_profile = analyze_customer_behavior(customer_id)
+    experiences = explore_customer_context(customer_id, query)
+
+    # The active LangWatch trace is labelled only after both lookups returned.
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_explore_customer_account"]})
+
+    # Copy the five reported attributes of each experience into a plain dict.
+    experience_rows = []
+    for item in experiences:
+        experience_rows.append(
+            {
+                "type": item.component_type,
+                "title": item.title,
+                "data": item.data,
+                "actions": item.actions,
+                "priority": item.priority,
+            }
+        )
+
+    return json.dumps(
+        {"customer_behavior": behavior_profile, "rich_experiences": experience_rows}
+    )
+
+
+def escalate_to_human(reason: str, urgency: str = "medium") -> str:
+    """
+    Escalate the conversation to a human agent
+
+    Args:
+        reason: Why the escalation is needed
+        urgency: Priority level (low, medium, high)
+
+    Returns:
+        JSON string with escalation details
+    """
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["escalation"]})
+
+    # NOTE(review): only the exact, case-sensitive value "high" shortens the quoted wait.
+    wait_window = "5-10 minutes" if urgency == "high" else "10-15 minutes"
+    handoff_note = "I'm connecting you with a specialist who can provide additional assistance."
+    details = {"escalated": True, "reason": reason, "urgency": urgency}
+    details["estimated_wait"] = wait_window
+    details["message"] = handoff_note
+    return json.dumps(details)
+
+
+# Module-level support agent; chat_with_agent() below sends every message to this one instance.
+support_agent = Agent(
+    name="BankCustomerSupportAgent",
+    model=Nebius(
+        id="MiniMaxAI/MiniMax-M2.1",  # NOTE(review): same id as the Nebius(...) given to agent_config.set_model() above; keep in sync
+        api_key=os.getenv("NEBIUS_API_KEY"),  # None if the variable is unset
+    ),
+    tools=[  # the four tool functions defined above in this file
+        get_conversation_summary,
+        get_message_suggestion,
+        explore_customer_account,
+        escalate_to_human,
+    ],
+    description=SYSTEM_PROMPT,
+    add_history_to_context=True,  # Let Agno handle memory
+)
+
+
+# Simple interface for testing
+def chat_with_agent(message: str) -> str:
+    """Send one customer message to the shared support agent and return its reply content."""
+    reply = support_agent.run(message)
+    return reply.content
+
+
+# Example usage
+if __name__ == "__main__":
+    # Manual smoke run: a scripted two-turn fraud conversation against the agent.
+    print("=== Bank Customer Support Agent ===")
+    print(
+        "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?"
+    )
+
+    # Turns are sent in order to the same module-level agent via chat_with_agent().
+    scripted_turns = (
+        "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud.",
+        "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?",
+    )
+
+    for customer_turn in scripted_turns:
+        # Echo the customer line first, then the agent's reply to it.
+        print(f"\nCustomer: {customer_turn}")
+        agent_reply = chat_with_agent(customer_turn)
+        print(f"Agent: {agent_reply}")
+
+    # Footer marks the end of the scripted exchange.
+    print("\n=== Conversation Complete ===")
diff --git a/main_support_agent_openai.py b/main_support_agent_openai.py
new file mode 100644
index 0000000..259167c
--- /dev/null
+++ b/main_support_agent_openai.py
@@ -0,0 +1,229 @@
+"""
+Main Bank Customer Support Agent - OpenAI GPT-OSS-120B Model
+
+This is the production code - kept very simple. One agent with tools, Agno handles memory.
+"""
+
+import os
+import json
+from typing import Dict, Any
+import dotenv
+from agno.agent import Agent
+from agno.models.nebius import Nebius
+
+import agent_config
+
+dotenv.load_dotenv()
+
+agent_config.set_model(Nebius(id="openai/gpt-oss-120b", api_key=os.getenv("NEBIUS_API_KEY")))
+
+# Import our specialized agents as tools
+from agents.summary_agent import summarize_conversation
+from agents.next_message_agent import suggest_next_message
+from agents.customer_explorer_agent import (
+    explore_customer_context,
+    analyze_customer_behavior,
+)
+
+import langwatch
+from openinference.instrumentation.agno import AgnoInstrumentor
+
+langwatch.setup(instrumentors=[AgnoInstrumentor()])
+
+SYSTEM_PROMPT = """
+You are a customer support agent for SecureBank, a modern digital banking platform.
+
+Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations:
+
+TOOL USAGE REQUIREMENTS:
+
+1. **explore_customer_account** - ALWAYS use when:
+   - Customer mentions fraud, unauthorized transactions, or security concerns
+   - Customer asks about spending patterns, budgeting, or financial analysis
+   - Customer needs account-specific insights or personalized recommendations
+   - Any urgent business account issues that need immediate investigation
+
+2. **get_message_suggestion** - ALWAYS use when:
+   - Customer has complex, multi-part problems (locked accounts + fees + missing deposits)
+   - You need guidance on complex banking regulations or procedures
+   - Customer issue involves multiple interconnected banking services
+
+3. **escalate_to_human** - ALWAYS use when:
+   - Customer explicitly demands to speak with a manager, supervisor, or human agent
+   - Customer expresses extreme frustration or dissatisfaction
+   - Business customer has urgent issues affecting operations (payroll, employee payments)
+   - Set urgency to "high" for business-critical issues
+
+4. **get_conversation_summary** - Use when:
+   - Customer asks you to summarize the conversation
+   - You need to analyze conversation patterns or sentiment
+
+CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly.
+
+Guidelines:
+- Be helpful, professional, and empathetic
+- Use tools proactively based on the requirements above
+- Provide clear, actionable solutions
+- Always prioritize customer security and privacy
+
+Remember: Tool usage is not optional when the situation matches the requirements above.
+"""  # passed unchanged as Agent(description=...) when support_agent is built below; tool names must match the functions in this file
+
+
+def get_conversation_summary(conversation_context: str = "recent messages") -> str:
+    """
+    Analyze the conversation for patterns, sentiment, and key issues
+
+    Args:
+        conversation_context: Context about what to analyze
+
+    Returns:
+        JSON string with conversation analysis
+    """
+    # Docstring kept verbatim: presumably it doubles as the tool description shown to the model.
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_conversation_summary"]})
+
+    # Placeholder: no conversation history is read yet, so conversation_context
+    # is ignored and the same canned analysis is returned on every call.
+    canned_analysis = {
+        "summary": "Conversation analysis requested",
+        "sentiment": "neutral",
+        "key_issues": ["general inquiry"],
+        "suggested_actions": ["continue conversation"],
+    }
+    return json.dumps(canned_analysis)
+
+
+def get_message_suggestion(customer_query: str, context: str = "") -> str:
+    """
+    Get suggestions for responding to customer queries using knowledge base
+
+    Args:
+        customer_query: The customer's question or concern
+        context: Additional context about the conversation
+
+    Returns:
+        JSON string with response suggestions
+    """
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_message_suggestion"]})
+    # Stand-in for a knowledge-base lookup: echoes the query; `context` is unused.
+    echoed_reply = f"I understand your concern about: {customer_query}. Let me help you with that."
+    payload = {
+        "suggested_response": echoed_reply,
+        "confidence": "medium",
+        "knowledge_sources": ["general_banking_guide"],
+        "alternatives": ["Ask for more details", "Escalate to specialist"],
+    }
+    return json.dumps(payload)
+
+
+def explore_customer_account(customer_id: str, query: str) -> str:
+    """
+    Explore customer account data and provide rich insights
+
+    Args:
+        customer_id: Customer identifier (e.g., CUST_001)
+        query: What to explore about the customer
+
+    Returns:
+        JSON string with customer insights and rich experiences
+    """
+    # Helper calls run first, in this order: behavior analysis, then the
+    # query-specific experiences (both come from agents.customer_explorer_agent).
+    behavior_profile = analyze_customer_behavior(customer_id)
+    experiences = explore_customer_context(customer_id, query)
+
+    # The active LangWatch trace is labelled only after both lookups returned.
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_explore_customer_account"]})
+
+    # Copy the five reported attributes of each experience into a plain dict.
+    experience_rows = []
+    for item in experiences:
+        experience_rows.append(
+            {
+                "type": item.component_type,
+                "title": item.title,
+                "data": item.data,
+                "actions": item.actions,
+                "priority": item.priority,
+            }
+        )
+
+    return json.dumps(
+        {"customer_behavior": behavior_profile, "rich_experiences": experience_rows}
+    )
+
+
+def escalate_to_human(reason: str, urgency: str = "medium") -> str:
+    """
+    Escalate the conversation to a human agent
+
+    Args:
+        reason: Why the escalation is needed
+        urgency: Priority level (low, medium, high)
+
+    Returns:
+        JSON string with escalation details
+    """
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["escalation"]})
+
+    # NOTE(review): only the exact, case-sensitive value "high" shortens the quoted wait.
+    wait_window = "5-10 minutes" if urgency == "high" else "10-15 minutes"
+    handoff_note = "I'm connecting you with a specialist who can provide additional assistance."
+    details = {"escalated": True, "reason": reason, "urgency": urgency}
+    details["estimated_wait"] = wait_window
+    details["message"] = handoff_note
+    return json.dumps(details)
+
+
+# Module-level support agent; chat_with_agent() below sends every message to this one instance.
+support_agent = Agent(
+    name="BankCustomerSupportAgent",
+    model=Nebius(
+        id="openai/gpt-oss-120b",  # NOTE(review): same id as the Nebius(...) given to agent_config.set_model() above; keep in sync
+        api_key=os.getenv("NEBIUS_API_KEY"),  # None if the variable is unset
+    ),
+    tools=[  # the four tool functions defined above in this file
+        get_conversation_summary,
+        get_message_suggestion,
+        explore_customer_account,
+        escalate_to_human,
+    ],
+    description=SYSTEM_PROMPT,
+    add_history_to_context=True,  # Let Agno handle memory
+)
+
+
+# Simple interface for testing
+def chat_with_agent(message: str) -> str:
+    """Send one customer message to the shared support agent and return its reply content."""
+    reply = support_agent.run(message)
+    return reply.content
+
+
+# Example usage
+if __name__ == "__main__":
+    # Manual smoke run: a scripted two-turn fraud conversation against the agent.
+    print("=== Bank Customer Support Agent ===")
+    print(
+        "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?"
+    )
+
+    # Turns are sent in order to the same module-level agent via chat_with_agent().
+    scripted_turns = (
+        "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud.",
+        "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?",
+    )
+
+    for customer_turn in scripted_turns:
+        # Echo the customer line first, then the agent's reply to it.
+        print(f"\nCustomer: {customer_turn}")
+        agent_reply = chat_with_agent(customer_turn)
+        print(f"Agent: {agent_reply}")
+
+    # Footer marks the end of the scripted exchange.
+    print("\n=== Conversation Complete ===")
diff --git a/tests-demo/__init__.py b/tests-demo/__init__.py
new file mode 100644
index 0000000..26bd4a3
--- /dev/null
+++ b/tests-demo/__init__.py
@@ -0,0 +1,3 @@
+"""
+Demo tests for bank customer support agent across different models
+"""
diff --git a/tests-demo/test_demo_claude.py b/tests-demo/test_demo_claude.py
new file mode 100644
index 0000000..d21b33c
--- /dev/null
+++ b/tests-demo/test_demo_claude.py
@@ -0,0 +1,427 @@
+"""
+Tests for the main bank customer support agent - Claude Sonnet 4.5 Model
+
+These tests cover real business scenarios and validate tool-calling behavior of the
+Claude Sonnet 4.5 support agent; the user simulator and judge run on openai/gpt-4o.
+"""
+import asyncio
+import pytest
+import json
+import sys
+import os
+import dotenv
+
+# Add parent directory to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import scenario
+from main_support_agent_claude import support_agent
+
+dotenv.load_dotenv()
+scenario.configure(default_model="openai/gpt-4o")
+
+
+def _parse_tool_arguments(tool_call: dict) -> dict:
+    """Return the tool call's arguments as a dict; {} when absent, malformed, or not a JSON object."""
+    raw_args = tool_call["function"].get("arguments", {})
+    if isinstance(raw_args, str):
+        try:
+            raw_args = json.loads(raw_args)
+        except json.JSONDecodeError:
+            return {}
+    # json.loads may yield None, a list or a scalar; callers rely on dict.get().
+    return raw_args if isinstance(raw_args, dict) else {}
+
+
+def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None:  # fail with the scenario's reasoning
+    assert result.success, "{} failed: {}".format(test_name, result.reasoning or "No failure reasoning returned")
+
+
+def _build_tool_trace_messages(response) -> list[dict]:
+    """Rebuild an OpenAI-style message list from an agent run's tool executions.
+
+    Each entry of ``response.tools`` becomes an assistant ``tool_calls`` message
+    followed by the matching ``tool`` result message; the final text reply, when
+    it is a non-empty string, is appended last. Returns [] if there is neither.
+    """
+    trace: list[dict] = []
+    executions = response.tools or []
+
+    for index, execution in enumerate(executions):
+        # Fall back to a positional id / placeholder name when the run omitted them.
+        call_id = execution.tool_call_id or f"tool_call_{index}"
+        name = execution.tool_name or "unknown_tool"
+        arguments = execution.tool_args if isinstance(execution.tool_args, dict) else {}
+        if isinstance(execution.result, str):
+            output = execution.result
+        else:
+            # Non-string results are JSON-encoded; falsy ones become "{}".
+            output = json.dumps(execution.result or {})
+
+        function_call = {"name": name, "arguments": json.dumps(arguments)}
+        call_entry = {"id": call_id, "type": "function", "function": function_call}
+        trace.append(
+            {"role": "assistant", "content": "", "tool_calls": [call_entry]}
+        )
+        trace.append(
+            {"role": "tool", "tool_call_id": call_id, "content": output}
+        )
+
+    # The plain-text reply, if any, always comes after the tool messages.
+    final_text = response.content
+    if isinstance(final_text, str) and final_text:
+        trace.append({"role": "assistant", "content": final_text})
+
+    return trace
+
+
+class BankSupportAgentAdapter(scenario.AgentAdapter):
+    """Bridges scenario's agent interface to the module-level ``support_agent``."""
+
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        # Only the newest user turn is forwarded. Every test in this module shares
+        # one agent instance, so the Agno session is keyed on the scenario thread:
+        # whatever history Agno keeps stays scoped to the current scenario run.
+        response = support_agent.run(
+            input.last_new_user_message_str(), session_id=input.thread_id
+        )
+
+        # Synthetic messages pair each tool call with its result (avoids missing
+        # tool_call_id issues with Claude's toolu_* IDs) and end with the reply.
+        synthetic_messages = _build_tool_trace_messages(response)
+        # Empty only when there were no tools and no non-empty text: return content as-is.
+        return synthetic_messages or response.content  # type: ignore
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_fraud_investigation_workflow():  # scripted three-turn fraud report, scored by the judge criteria
+    result = await scenario.run(
+        name="fraud investigation and card security - Claude",
+        description="""
+            Customer discovers unauthorized transactions on their account and is worried about fraud.
+            They need immediate help to secure their account and investigate the suspicious activity.
+            This tests whether the agent responds with appropriate urgency and offers concrete security actions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent takes fraud concerns seriously and responds with urgency",
+                    "Agent gathers necessary information (account details) to investigate",
+                    "Agent offers concrete security actions like card freezing or blocking",
+                    "Agent provides clear next steps for fraud investigation and dispute process",
+                    "Agent maintains professional and reassuring tone throughout",
+                    "Agent does not re-ask for customer ID that was already provided",
+                ]
+            ),
+        ],
+        script=[  # every user turn is fixed text; scenario.agent() lets the adapter answer
+            scenario.user(
+                "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "Yes, please help me secure my account right away. I'm really worried about more charges appearing."
+            ),
+            scenario.agent(),
+            scenario.judge(),  # single verdict after the third agent reply
+        ],
+    )
+
+    _assert_success(result, "Fraud investigation test")  # NOTE(review): tool usage itself is not asserted here
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_escalation_workflow():  # only test here that asserts a specific tool call, via the script hook below
+    def check_escalation_called(state: scenario.ScenarioState):
+        """Verify agent escalates when customer explicitly demands human help"""
+        assert state.has_tool_call(
+            "escalate_to_human"
+        ), "Agent should escalate when customer demands manager/human help"
+
+        tool_call = state.last_tool_call("escalate_to_human")
+        if tool_call:
+            args = _parse_tool_arguments(tool_call)
+            reason = str(args.get("reason") or "").lower()  # a missing or null reason fails the assert below instead of raising
+            assert any(
+                keyword in reason
+                for keyword in ["frustrated", "manager", "human", "escalation"]
+            ), "Escalation reason should reflect customer's frustration and demand"
+
+    result = await scenario.run(
+        name="customer escalation to human agent - Claude",
+        description="""
+            Customer has been dealing with an ongoing issue and is frustrated.
+            They explicitly demand to speak with a human agent or manager.
+            The agent should handle this professionally and escalate appropriately.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent acknowledges customer's frustration empathetically",
+                    "Agent offers to escalate when requested",
+                    "Agent provides escalation timeline and process information",
+                    "Agent maintains professionalism despite customer frustration",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service."
+            ),
+            scenario.agent(),
+            check_escalation_called,  # runs after the second agent reply, before the judge
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Escalation test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_complex_issue_triggers_knowledge_base():  # scripted two-turn, three-problem complaint scored by the judge
+    result = await scenario.run(
+        name="complex multi-issue banking problem - Claude",
+        description="""
+            Customer has multiple interconnected banking problems: locked online banking,
+            unexpected fees, and missing direct deposit. They need systematic help.
+            This tests whether the agent can handle multiple issues comprehensively.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
+                    "Agent provides systematic approach with clear steps for each issue",
+                    "Agent shows empathy for customer's stress and urgency",
+                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
+                    "Agent offers concrete next steps that the customer can act on",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out."
+            ),
+            scenario.agent(),
+            scenario.judge(),  # single verdict after the second agent reply
+        ],
+    )
+
+    _assert_success(result, "Complex issue test")  # NOTE(review): despite the test name, no get_message_suggestion call is asserted
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_urgent_business_scenario():  # scripted two-turn frozen-business-account case, scored by the judge
+    result = await scenario.run(
+        name="urgent business account problem - Claude",
+        description="""
+            Business customer has an urgent issue affecting their operations.
+            They can't access funds to pay employees. This tests whether the agent
+            recognizes urgency and takes appropriate high-priority action.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent immediately recognizes the business urgency and employee impact",
+                    "Agent responds with high priority and urgency in tone",
+                    "Agent takes concrete action (investigating the freeze or escalating to specialists)",
+                    "Agent provides realistic timeline or sets expectations appropriately",
+                    "Agent offers interim solutions or workarounds if available",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?"
+            ),
+            scenario.agent(),
+            scenario.judge(),  # single verdict after the second agent reply
+        ],
+    )
+
+    _assert_success(result, "Urgent business test")  # NOTE(review): escalation/urgency tool arguments are not asserted here
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_simple_inquiry_no_tools():  # one scripted question about support hours, scored by the judge
+    result = await scenario.run(
+        name="simple inquiry without tool usage - Claude",
+        description="""
+            Customer asks a simple question about branch hours or general banking info.
+            The agent should answer directly without invoking any tools.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent answers the simple question directly and helpfully",
+                    "Agent does not over-complicate the response",
+                    "Agent maintains a friendly and professional tone",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "What are your customer support hours? I just want to know when I can call if I have an issue."
+            ),
+            scenario.agent(),
+            scenario.judge(),  # single verdict after the only agent reply
+        ],
+    )
+
+    _assert_success(result, "Simple inquiry test")  # NOTE(review): "no tools" is described above but never asserted
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_spending_analysis_request():  # scripted two-turn budgeting conversation, scored by the judge
+    result = await scenario.run(
+        name="spending analysis and budgeting help - Claude",
+        description="""
+            Customer wants to understand their spending patterns and get budgeting advice.
+            The agent should use explore_customer_account to analyze their transactions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent uses account exploration tools to analyze spending",
+                    "Agent provides specific insights about spending categories",
+                    "Agent offers actionable budgeting advice or recommendations",
+                    "Agent is helpful and non-judgmental about spending habits",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "That's really helpful. Are there any areas where you think I could cut back?"
+            ),
+            scenario.agent(),
+            scenario.judge(),  # single verdict after the second agent reply
+        ],
+    )
+
+    _assert_success(result, "Spending analysis test")  # NOTE(review): tool use is left to the judge criterion; no has_tool_call check
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_lost_card_replacement():  # scripted two-turn lost-debit-card case, scored by the judge
+    result = await scenario.run(
+        name="lost card replacement workflow - Claude",
+        description="""
+            Customer has lost their debit card and needs a replacement.
+            Tests whether the agent handles the card replacement process properly
+            including immediate security measures.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent treats lost card with appropriate urgency",
+                    "Agent suggests freezing or blocking the lost card immediately",
+                    "Agent explains the replacement card process and timeline",
+                    "Agent asks about any unauthorized transactions since the card was lost",
+                    "Agent reassures the customer about account security",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?"
+            ),
+            scenario.agent(),
+            scenario.judge(),  # single verdict after the second agent reply
+        ],
+    )
+
+    _assert_success(result, "Lost card replacement test")  # pass/fail rests on the judge criteria alone
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_overdraft_fee_dispute():  # scripted two-turn fee dispute; the only test here that uses CUST_002
+    result = await scenario.run(
+        name="overdraft fee dispute and resolution - Claude",
+        description="""
+            Customer with a basic checking account notices an overdraft fee and wants
+            it reversed. Tests empathy, account investigation, and fee resolution.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent shows empathy for the customer's frustration about the fee",
+                    "Agent investigates the account to understand the overdraft situation",
+                    "Agent explains how the overdraft fee occurred",
+                    "Agent offers a resolution path (fee waiver, escalation, or explanation)",
+                    "Agent suggests ways to avoid future overdraft fees",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "This isn't fair. I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?"
+            ),
+            scenario.agent(),
+            scenario.judge(),  # single verdict after the second agent reply
+        ],
+    )
+
+    _assert_success(result, "Overdraft fee dispute test")  # pass/fail rests on the judge criteria alone
+
+
+if __name__ == "__main__":  # direct execution runs only the fraud scenario, outside pytest
+    asyncio.run(test_fraud_investigation_workflow())
diff --git a/tests-demo/test_demo_deepseek.py b/tests-demo/test_demo_deepseek.py
new file mode 100644
index 0000000..6133a40
--- /dev/null
+++ b/tests-demo/test_demo_deepseek.py
@@ -0,0 +1,451 @@
+"""
+Tests for the main bank customer support agent - DeepSeek Model
+
+These tests cover real business scenarios and validate tool calling behavior
+using Nebius DeepSeek-V3.2 model for evaluation.
+"""
+import asyncio
+import pytest
+import json
+import sys
+import os
+import dotenv
+
+# Add parent directory to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import scenario
+from main_support_agent_deepseek import support_agent
+
+dotenv.load_dotenv()  # NOTE(review): runs after the support_agent import above; confirm that module does not need .env at import time.
+scenario.configure(default_model="openai/gpt-4o")  # Scenario-wide default model; the agents below also pass this model explicitly.
+
+
+def _parse_tool_arguments(tool_call: dict) -> dict:
+    """Return the call's arguments as a dict; {} if absent, malformed, or not a JSON object."""
+    raw_args = tool_call["function"].get("arguments", {})
+    if isinstance(raw_args, str):
+        try:
+            raw_args = json.loads(raw_args)
+        except json.JSONDecodeError:
+            return {}
+    # Valid JSON need not be an object ("null", "[]"); callers rely on dict methods.
+    return raw_args if isinstance(raw_args, dict) else {}
+
+
+def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None:  # Fail the test, quoting the scenario's reasoning when present.
+    assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}"
+
+
+def _build_tool_trace_messages(response) -> list[dict]:
+    """Rebuild an OpenAI-style message trace from ``response.tools``.
+
+    Each tool execution becomes an assistant ``tool_calls`` message followed by the
+    matching ``tool`` result message; a non-empty string ``response.content`` is
+    appended last as a plain assistant message.
+    """
+    messages: list[dict] = []
+    for i, tool in enumerate(response.tools or []):
+        tool_call_id = tool.tool_call_id or f"tool_call_{i}"
+        tool_args = tool.tool_args if isinstance(tool.tool_args, dict) else {}
+        if isinstance(tool.result, str):
+            tool_result = tool.result
+        else:
+            # Only a missing result becomes "{}": falsy values such as 0 or [] are kept,
+            # and default=str keeps a non-JSON-serializable result from raising.
+            tool_result = json.dumps({} if tool.result is None else tool.result, default=str)
+        function = {"name": tool.tool_name or "unknown_tool", "arguments": json.dumps(tool_args)}
+
+        messages.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {"id": tool_call_id, "type": "function", "function": function}
+                ],
+            }
+        )
+        messages.append(
+            {"role": "tool", "tool_call_id": tool_call_id, "content": tool_result}
+        )
+
+    if isinstance(response.content, str) and response.content:
+        messages.append({"role": "assistant", "content": response.content})
+
+    return messages
+
+
+class BankSupportAgentAdapter(scenario.AgentAdapter):
+    """Scenario adapter that forwards each new user message to ``support_agent``."""
+
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        """Run the agent on the latest user message and return its reply for Scenario.
+
+        Returns the run's assistant/tool messages when an assistant message carries
+        tool calls, otherwise a trace rebuilt from ``response.tools``, otherwise the
+        bare ``response.content``.
+        """
+        message_content = input.last_new_user_message_str()
+        # ``run`` is synchronous; use a worker thread so the event loop is not blocked.
+        response = await asyncio.to_thread(support_agent.run, message_content)
+
+        # Convert Agno messages to OpenAI format for Scenario. Only assistant and tool
+        # messages are kept: Scenario manages the conversation flow, and tool messages
+        # are needed to satisfy OpenAI's requirements.
+        relevant_messages = []
+        for message in response.messages or []:
+            if message.role not in ("assistant", "tool"):
+                continue
+            msg_dict = {"role": message.role, "content": message.content}
+
+            # Add tool calls if present (for assistant messages)
+            if getattr(message, "tool_calls", None):
+                msg_dict["tool_calls"] = message.tool_calls
+
+            # Add tool call ID if present (for tool messages)
+            if getattr(message, "tool_call_id", None):
+                msg_dict["tool_call_id"] = message.tool_call_id
+
+            relevant_messages.append(msg_dict)
+
+        if any(m["role"] == "assistant" and "tool_calls" in m for m in relevant_messages):
+            return relevant_messages
+
+        synthetic_messages = _build_tool_trace_messages(response)
+        if synthetic_messages:
+            return synthetic_messages
+
+        # Nothing to rebuild (no tool executions, no string content): hand back the raw content.
+        return response.content  # type: ignore
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_fraud_investigation_workflow():  # Scripted three-turn fraud report for CUST_001, evaluated against the judge criteria below.
+    result = await scenario.run(
+        name="fraud investigation and card security - DeepSeek",
+        description="""
+            Customer discovers unauthorized transactions on their account and is worried about fraud.
+            They need immediate help to secure their account and investigate the suspicious activity.
+            This tests whether the agent responds with appropriate urgency and offers concrete security actions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent takes fraud concerns seriously and responds with urgency",
+                    "Agent gathers necessary information (account details) to investigate",
+                    "Agent offers concrete security actions like card freezing or blocking",
+                    "Agent provides clear next steps for fraud investigation and dispute process",
+                    "Agent maintains professional and reassuring tone throughout",
+                    "Agent does not re-ask for customer ID that was already provided",
+                ]
+            ),
+        ],
+        script=[  # Fixed user turns, each followed by an agent turn; the judge step runs last.
+            scenario.user(
+                "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "Yes, please help me secure my account right away. I'm really worried about more charges appearing."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Fraud investigation test")
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_escalation_workflow():  # Scripted demand for a manager; must end in an escalate_to_human tool call.
+    def check_escalation_called(state: scenario.ScenarioState):
+        """Verify agent escalates when customer explicitly demands human help"""
+        assert state.has_tool_call(
+            "escalate_to_human"
+        ), "Agent should escalate when customer demands manager/human help"
+
+        tool_call = state.last_tool_call("escalate_to_human")
+        if tool_call:
+            # A null or non-string "reason" must fail the keyword check below, not raise AttributeError.
+            reason = str(_parse_tool_arguments(tool_call).get("reason") or "").lower()
+            assert any(
+                keyword in reason
+                for keyword in ["frustrated", "manager", "human", "escalation"]
+            ), "Escalation reason should reflect customer's frustration and demand"
+
+    result = await scenario.run(
+        name="customer escalation to human agent - DeepSeek",
+        description="""
+            Customer has been dealing with an ongoing issue and is frustrated.
+            They explicitly demand to speak with a human agent or manager.
+            The agent should handle this professionally and escalate appropriately.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent acknowledges customer's frustration empathetically",
+                    "Agent offers to escalate when requested",
+                    "Agent provides escalation timeline and process information",
+                    "Agent maintains professionalism despite customer frustration",
+                ]
+            ),
+        ],
+        script=[  # The tool-call check runs after the second agent turn, before the judge step.
+            scenario.user(
+                "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service."
+            ),
+            scenario.agent(),
+            check_escalation_called,
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Escalation test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_complex_issue_triggers_knowledge_base():  # Scripted report of three simultaneous account problems for CUST_001, evaluated against the judge criteria below.
+    result = await scenario.run(
+        name="complex multi-issue banking problem - DeepSeek",
+        description="""
+            Customer has multiple interconnected banking problems: locked online banking,
+            unexpected fees, and missing direct deposit. They need systematic help.
+            This tests whether the agent can handle multiple issues comprehensively.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
+                    "Agent provides systematic approach with clear steps for each issue",
+                    "Agent shows empathy for customer's stress and urgency",
+                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
+                    "Agent offers concrete next steps that the customer can act on",
+                ]
+            ),
+        ],
+        script=[  # Fixed user turns, each followed by an agent turn; the judge step runs last.
+            scenario.user(
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Complex issue test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_urgent_business_scenario():  # Scripted frozen-business-account payroll emergency, evaluated against the judge criteria below.
+    result = await scenario.run(
+        name="urgent business account problem - DeepSeek",
+        description="""
+            Business customer has an urgent issue affecting their operations.
+            They can't access funds to pay employees. This tests whether the agent
+            recognizes urgency and takes appropriate high-priority action.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent immediately recognizes the business urgency and employee impact",
+                    "Agent responds with high priority and urgency in tone",
+                    "Agent takes concrete action (investigating the freeze or escalating to specialists)",
+                    "Agent provides realistic timeline or sets expectations appropriately",
+                    "Agent offers interim solutions or workarounds if available",
+                ]
+            ),
+        ],
+        script=[  # Fixed user turns, each followed by an agent turn; the judge step runs last.
+            scenario.user(
+                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Urgent business test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_simple_inquiry_no_tools():  # Single scripted support-hours question. NOTE(review): nothing here asserts that no tool was called, despite the test name.
+    result = await scenario.run(
+        name="simple inquiry without tool usage - DeepSeek",
+        description="""
+            Customer asks a simple question about branch hours or general banking info.
+            The agent should answer directly without invoking any tools.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent answers the simple question directly and helpfully",
+                    "Agent does not over-complicate the response",
+                    "Agent maintains a friendly and professional tone",
+                ]
+            ),
+        ],
+        script=[  # One fixed user turn, one agent turn, then the judge step.
+            scenario.user(
+                "What are your customer support hours? I just want to know when I can call if I have an issue."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Simple inquiry test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_spending_analysis_request():  # Scripted spending-analysis request for CUST_001. NOTE(review): use of explore_customer_account is only judged via criteria, not asserted.
+    result = await scenario.run(
+        name="spending analysis and budgeting help - DeepSeek",
+        description="""
+            Customer wants to understand their spending patterns and get budgeting advice.
+            The agent should use explore_customer_account to analyze their transactions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent uses account exploration tools to analyze spending",
+                    "Agent provides specific insights about spending categories",
+                    "Agent offers actionable budgeting advice or recommendations",
+                    "Agent is helpful and non-judgmental about spending habits",
+                ]
+            ),
+        ],
+        script=[  # Fixed user turns, each followed by an agent turn; the judge step runs last.
+            scenario.user(
+                "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "That's really helpful. Are there any areas where you think I could cut back?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Spending analysis test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_lost_card_replacement():  # Scripted lost-debit-card report for CUST_001, evaluated against the judge criteria below.
+    result = await scenario.run(
+        name="lost card replacement workflow - DeepSeek",
+        description="""
+            Customer has lost their debit card and needs a replacement.
+            Tests whether the agent handles the card replacement process properly
+            including immediate security measures.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent treats lost card with appropriate urgency",
+                    "Agent suggests freezing or blocking the lost card immediately",
+                    "Agent explains the replacement card process and timeline",
+                    "Agent asks about any unauthorized transactions since the card was lost",
+                    "Agent reassures the customer about account security",
+                ]
+            ),
+        ],
+        script=[  # Fixed user turns, each followed by an agent turn; the judge step runs last.
+            scenario.user(
+                "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Lost card replacement test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_overdraft_fee_dispute():  # Scripted overdraft-fee dispute for CUST_002, evaluated against the judge criteria below.
+    result = await scenario.run(
+        name="overdraft fee dispute and resolution - DeepSeek",
+        description="""
+            Customer with a basic checking account notices an overdraft fee and wants
+            it reversed. Tests empathy, account investigation, and fee resolution.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent shows empathy for the customer's frustration about the fee",
+                    "Agent investigates the account to understand the overdraft situation",
+                    "Agent explains how the overdraft fee occurred",
+                    "Agent offers a resolution path (fee waiver, escalation, or explanation)",
+                    "Agent suggests ways to avoid future overdraft fees",
+                ]
+            ),
+        ],
+        script=[  # Fixed user turns, each followed by an agent turn; the judge step runs last.
+            scenario.user(
+                "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "This isn't fair. I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Overdraft fee dispute test")
+
+
+if __name__ == "__main__":  # Direct execution runs only the fraud investigation scenario, without pytest.
+    asyncio.run(test_fraud_investigation_workflow())
diff --git a/tests-demo/test_demo_glm.py b/tests-demo/test_demo_glm.py
new file mode 100644
index 0000000..056e782
--- /dev/null
+++ b/tests-demo/test_demo_glm.py
@@ -0,0 +1,453 @@
+"""
+Tests for the main bank customer support agent - GLM Model
+
+These tests cover real business scenarios and validate tool calling behavior
+using Nebius GLM-4.7-FP8 model for evaluation.
+"""
+import asyncio
+import pytest
+import json
+import sys
+import os
+import dotenv
+
+# Add parent directory to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import scenario
+from main_support_agent_glm import support_agent
+
+dotenv.load_dotenv()  # NOTE(review): runs after the support_agent import above; confirm that module does not need .env at import time.
+scenario.configure(default_model="openai/gpt-4o")  # Scenario-wide default model; the agents below also pass this model explicitly.
+
+
+def _parse_tool_arguments(tool_call: dict) -> dict:
+    """Return the call's arguments as a dict; {} if absent, malformed, or not a JSON object."""
+    raw_args = tool_call["function"].get("arguments", {})
+    if isinstance(raw_args, str):
+        try:
+            raw_args = json.loads(raw_args)
+        except json.JSONDecodeError:
+            return {}
+    # Valid JSON need not be an object ("null", "[]"); callers rely on dict methods.
+    return raw_args if isinstance(raw_args, dict) else {}
+
+
+def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None:  # Fail the test, quoting the scenario's reasoning when present.
+    assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}"
+
+
+def _build_tool_trace_messages(response) -> list[dict]:
+    """Rebuild an OpenAI-style message trace from ``response.tools``.
+
+    Each tool execution becomes an assistant ``tool_calls`` message followed by the
+    matching ``tool`` result message; a non-empty string ``response.content`` is
+    appended last as a plain assistant message.
+    """
+    messages: list[dict] = []
+    for i, tool in enumerate(response.tools or []):
+        tool_call_id = tool.tool_call_id or f"tool_call_{i}"
+        tool_args = tool.tool_args if isinstance(tool.tool_args, dict) else {}
+        if isinstance(tool.result, str):
+            tool_result = tool.result
+        else:
+            # Only a missing result becomes "{}": falsy values such as 0 or [] are kept,
+            # and default=str keeps a non-JSON-serializable result from raising.
+            tool_result = json.dumps({} if tool.result is None else tool.result, default=str)
+        function = {"name": tool.tool_name or "unknown_tool", "arguments": json.dumps(tool_args)}
+
+        messages.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {"id": tool_call_id, "type": "function", "function": function}
+                ],
+            }
+        )
+        messages.append(
+            {"role": "tool", "tool_call_id": tool_call_id, "content": tool_result}
+        )
+
+    if isinstance(response.content, str) and response.content:
+        messages.append({"role": "assistant", "content": response.content})
+
+    return messages
+
+
+class BankSupportAgentAdapter(scenario.AgentAdapter):
+    """Scenario adapter that forwards each new user message to ``support_agent``."""
+
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        """Run the agent on the latest user message and return its reply for Scenario.
+
+        Returns the run's assistant/tool messages when an assistant message carries
+        tool calls, otherwise a trace rebuilt from ``response.tools``, otherwise the
+        bare ``response.content``.
+        """
+        message_content = input.last_new_user_message_str()
+        # ``run`` is synchronous; use a worker thread so the event loop is not blocked.
+        response = await asyncio.to_thread(support_agent.run, message_content)
+
+        # Convert Agno messages to OpenAI format for Scenario. Only assistant and tool
+        # messages are kept: Scenario manages the conversation flow, and tool messages
+        # are needed to satisfy OpenAI's requirements.
+        relevant_messages = []
+        for message in response.messages or []:
+            if message.role not in ("assistant", "tool"):
+                continue
+            msg_dict = {"role": message.role, "content": message.content}
+
+            # Add tool calls if present (for assistant messages)
+            if getattr(message, "tool_calls", None):
+                msg_dict["tool_calls"] = message.tool_calls
+
+            # Add tool call ID if present (for tool messages)
+            if getattr(message, "tool_call_id", None):
+                msg_dict["tool_call_id"] = message.tool_call_id
+
+            relevant_messages.append(msg_dict)
+
+        if any(m["role"] == "assistant" and "tool_calls" in m for m in relevant_messages):
+            return relevant_messages
+
+        synthetic_messages = _build_tool_trace_messages(response)
+        if synthetic_messages:
+            return synthetic_messages
+
+        # Nothing to rebuild (no tool executions, no string content): hand back the raw content.
+        return response.content  # type: ignore
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_fraud_investigation_workflow():  # Scripted three-turn fraud report for CUST_001, evaluated against the judge criteria below.
+    result = await scenario.run(
+        name="fraud investigation and card security - GLM",
+        description="""
+            Customer discovers unauthorized transactions on their account and is worried about fraud.
+            They need immediate help to secure their account and investigate the suspicious activity.
+            This tests whether the agent responds with appropriate urgency and offers concrete security actions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent takes fraud concerns seriously and responds with urgency",
+                    "Agent gathers necessary information (account details) to investigate",
+                    "Agent offers concrete security actions like card freezing or blocking",
+                    "Agent provides clear next steps for fraud investigation and dispute process",
+                    "Agent maintains professional and reassuring tone throughout",
+                    "Agent does not re-ask for customer ID that was already provided",
+                ]
+            ),
+        ],
+        script=[  # Fixed user turns, each followed by an agent turn; the judge step runs last.
+            scenario.user(
+                "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "Yes, please help me secure my account right away. I'm really worried about more charges appearing."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Fraud investigation test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_escalation_workflow():  # Scripted demand for a manager; must end in an escalate_to_human tool call.
+    def check_escalation_called(state: scenario.ScenarioState):
+        """Verify agent escalates when customer explicitly demands human help"""
+        assert state.has_tool_call(
+            "escalate_to_human"
+        ), "Agent should escalate when customer demands manager/human help"
+
+        tool_call = state.last_tool_call("escalate_to_human")
+        if tool_call:
+            # A null or non-string "reason" must fail the keyword check below, not raise AttributeError.
+            reason = str(_parse_tool_arguments(tool_call).get("reason") or "").lower()
+            assert any(
+                keyword in reason
+                for keyword in ["frustrated", "manager", "human", "escalation"]
+            ), "Escalation reason should reflect customer's frustration and demand"
+
+    result = await scenario.run(
+        name="customer escalation to human agent - GLM",
+        description="""
+            Customer has been dealing with an ongoing issue and is frustrated.
+            They explicitly demand to speak with a human agent or manager.
+            The agent should handle this professionally and escalate appropriately.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent acknowledges customer's frustration empathetically",
+                    "Agent offers to escalate when requested",
+                    "Agent provides escalation timeline and process information",
+                    "Agent maintains professionalism despite customer frustration",
+                ]
+            ),
+        ],
+        script=[  # The tool-call check runs after the second agent turn, before the judge step.
+            scenario.user(
+                "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service."
+            ),
+            scenario.agent(),
+            check_escalation_called,
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Escalation test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_complex_issue_triggers_knowledge_base():  # Scripted report of three simultaneous account problems for CUST_001, evaluated against the judge criteria below.
+    result = await scenario.run(
+        name="complex multi-issue banking problem - GLM",
+        description="""
+            Customer has multiple interconnected banking problems: locked online banking,
+            unexpected fees, and missing direct deposit. They need systematic help.
+            This tests whether the agent can handle multiple issues comprehensively.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
+                    "Agent provides systematic approach with clear steps for each issue",
+                    "Agent shows empathy for customer's stress and urgency",
+                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
+                    "Agent offers concrete next steps that the customer can act on",
+                ]
+            ),
+        ],
+        script=[  # Fixed user turns, each followed by an agent turn; the judge step runs last.
+            scenario.user(
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Complex issue test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_urgent_business_scenario():  # Scripted frozen-business-account payroll emergency, evaluated against the judge criteria below.
+    result = await scenario.run(
+        name="urgent business account problem - GLM",
+        description="""
+            Business customer has an urgent issue affecting their operations.
+            They can't access funds to pay employees. This tests whether the agent
+            recognizes urgency and takes appropriate high-priority action.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent immediately recognizes the business urgency and employee impact",
+                    "Agent responds with high priority and urgency in tone",
+                    "Agent takes concrete action (investigating the freeze or escalating to specialists)",
+                    "Agent provides realistic timeline or sets expectations appropriately",
+                    "Agent offers interim solutions or workarounds if available",
+                ]
+            ),
+        ],
+        script=[  # Fixed user turns, each followed by an agent turn; the judge step runs last.
+            scenario.user(
+                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Urgent business test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_simple_inquiry_no_tools():  # Single scripted support-hours question. NOTE(review): nothing here asserts that no tool was called, despite the test name.
+    result = await scenario.run(
+        name="simple inquiry without tool usage - GLM",
+        description="""
+            Customer asks a simple question about branch hours or general banking info.
+            The agent should answer directly without invoking any tools.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent answers the simple question directly and helpfully",
+                    "Agent does not over-complicate the response",
+                    "Agent maintains a friendly and professional tone",
+                ]
+            ),
+        ],
+        script=[  # One fixed user turn, one agent turn, then the judge step.
+            scenario.user(
+                "What are your customer support hours? I just want to know when I can call if I have an issue."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Simple inquiry test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_spending_analysis_request():  # Two scripted turns: spending breakdown for CUST_001, then where to cut back.
+    result = await scenario.run(
+        name="spending analysis and budgeting help - GLM",
+        description="""
+            Customer wants to understand their spending patterns and get budgeting advice.
+            The agent should use explore_customer_account to analyze their transactions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent uses account exploration tools to analyze spending",
+                    "Agent provides specific insights about spending categories",
+                    "Agent offers actionable budgeting advice or recommendations",
+                    "Agent is helpful and non-judgmental about spending habits",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "That's really helpful. Are there any areas where you think I could cut back?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+    # NOTE(review): explore_customer_account usage is left to the judge criteria; no tool-call assertion is made here.
+    _assert_success(result, "Spending analysis test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_lost_card_replacement():  # Two scripted turns: lost debit card report, then a request to check activity and replace it.
+    result = await scenario.run(
+        name="lost card replacement workflow - GLM",
+        description="""
+            Customer has lost their debit card and needs a replacement.
+            Tests whether the agent handles the card replacement process properly
+            including immediate security measures.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent treats lost card with appropriate urgency",
+                    "Agent suggests freezing or blocking the lost card immediately",
+                    "Agent explains the replacement card process and timeline",
+                    "Agent asks about any unauthorized transactions since the card was lost",
+                    "Agent reassures the customer about account security",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+    # Pass/fail is taken from the scenario result via _assert_success; no tool call is asserted directly.
+    _assert_success(result, "Lost card replacement test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_overdraft_fee_dispute():  # Two scripted turns: $35 overdraft fee complaint from CUST_002, then a waiver request.
+    result = await scenario.run(
+        name="overdraft fee dispute and resolution - GLM",
+        description="""
+            Customer with a basic checking account notices an overdraft fee and wants
+            it reversed. Tests empathy, account investigation, and fee resolution.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent shows empathy for the customer's frustration about the fee",
+                    "Agent investigates the account to understand the overdraft situation",
+                    "Agent explains how the overdraft fee occurred",
+                    "Agent offers a resolution path (fee waiver, escalation, or explanation)",
+                    "Agent suggests ways to avoid future overdraft fees",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "This isn't fair. I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+    # Pass/fail is taken from the scenario result via _assert_success; no tool call is asserted directly.
+    _assert_success(result, "Overdraft fee dispute test")
+
+
+if __name__ == "__main__":  # Script entry point: runs only the fraud investigation scenario, without pytest.
+    asyncio.run(test_fraud_investigation_workflow())
diff --git a/tests-demo/test_demo_minimax.py b/tests-demo/test_demo_minimax.py
new file mode 100644
index 0000000..f2d481e
--- /dev/null
+++ b/tests-demo/test_demo_minimax.py
@@ -0,0 +1,453 @@
+"""
+Tests for the main bank customer support agent - MiniMax Model
+
+These tests cover real business scenarios and validate tool calling behavior
+using Nebius MiniMax-M2.1 model for evaluation.
+"""
+import asyncio
+import pytest
+import json
+import sys
+import os
+import dotenv
+
+# Add parent directory to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import scenario
+from main_support_agent_minimax import support_agent
+
+dotenv.load_dotenv()
+scenario.configure(default_model="openai/gpt-4o")
+
+
+def _parse_tool_arguments(tool_call: dict) -> dict:
+    """Return the call's ``function.arguments`` as a dict, or ``{}`` if absent or not a JSON object."""
+    raw_args = tool_call["function"].get("arguments", {})
+    if isinstance(raw_args, str):
+        try:
+            raw_args = json.loads(raw_args)
+        except json.JSONDecodeError:
+            return {}
+    # Valid JSON such as "null" or "[1]" is not a dict; callers use .get() on the result.
+    return raw_args if isinstance(raw_args, dict) else {}
+
+
+def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None:  # Fail with test_name and result.reasoning when result.success is falsy.
+    assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}"
+
+
+def _build_tool_trace_messages(response) -> list[dict]:
+    """Rebuild an OpenAI-style tool transcript from ``response.tools``.
+
+    Each tool yields an assistant message carrying one ``tool_calls`` entry and
+    the matching ``tool`` message; a non-empty string ``response.content`` is
+    appended last as a plain assistant message. Missing ids and names get
+    placeholders, and non-dict arguments are recorded as ``{}``.
+    """
+    messages: list[dict] = []
+    for i, tool in enumerate(response.tools or []):
+        tool_call_id = tool.tool_call_id or f"tool_call_{i}"
+        if isinstance(tool.result, str):
+            tool_result = tool.result
+        else:
+            # Only a missing result becomes "{}" (False/0/[] are real results);
+            # default=str keeps non-JSON-serializable results from raising.
+            payload = {} if tool.result is None else tool.result
+            tool_result = json.dumps(payload, default=str)
+        function = {
+            "name": tool.tool_name or "unknown_tool",
+            "arguments": json.dumps(tool.tool_args if isinstance(tool.tool_args, dict) else {}),
+        }
+        messages.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [{"id": tool_call_id, "type": "function", "function": function}],
+            }
+        )
+        # The tool message must reference the id used in the assistant message.
+        messages.append({"role": "tool", "tool_call_id": tool_call_id, "content": tool_result})
+
+    if isinstance(response.content, str) and response.content:
+        messages.append({"role": "assistant", "content": response.content})
+
+    return messages
+
+
+class BankSupportAgentAdapter(scenario.AgentAdapter):
+    """Scenario adapter that forwards each new user message to ``support_agent``."""
+
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        """Run the agent on the latest user message and return its output.
+
+        The result is, in order of preference: the run's assistant/tool
+        messages when any of them carries tool calls, a transcript rebuilt
+        from ``response.tools``, or the plain ``response.content``.
+        """
+        # NOTE(review): run() is a synchronous call inside a coroutine, so it
+        # presumably blocks the event loop for the duration of the agent run.
+        response = support_agent.run(input.last_new_user_message_str())
+
+        # Scenario manages the conversation flow itself, so system and user
+        # messages are dropped; tool messages are kept because OpenAI requires
+        # them alongside the tool calls they answer.
+        turn_messages = []
+        for message in response.messages or []:
+            if message.role not in ("assistant", "tool"):
+                continue
+            converted = {"role": message.role, "content": message.content}
+            if message.tool_calls:
+                converted["tool_calls"] = message.tool_calls
+            if getattr(message, "tool_call_id", None):
+                converted["tool_call_id"] = message.tool_call_id
+            turn_messages.append(converted)
+
+        carries_tool_calls = any(
+            entry["role"] == "assistant" and "tool_calls" in entry
+            for entry in turn_messages
+        )
+        if carries_tool_calls:
+            return turn_messages
+
+        # Without tool calls in the message list, rebuild the trace from the
+        # tools recorded on the response.
+        rebuilt = _build_tool_trace_messages(response)
+        if rebuilt:
+            return rebuilt
+
+        return response.content  # type: ignore
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_fraud_investigation_workflow():  # Three scripted turns: fraud report, customer ID plus disputed charges, request to secure the account.
+    result = await scenario.run(
+        name="fraud investigation and card security - MiniMax",
+        description="""
+            Customer discovers unauthorized transactions on their account and is worried about fraud.
+            They need immediate help to secure their account and investigate the suspicious activity.
+            This tests whether the agent responds with appropriate urgency and offers concrete security actions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent takes fraud concerns seriously and responds with urgency",
+                    "Agent gathers necessary information (account details) to investigate",
+                    "Agent offers concrete security actions like card freezing or blocking",
+                    "Agent provides clear next steps for fraud investigation and dispute process",
+                    "Agent maintains professional and reassuring tone throughout",
+                    "Agent does not re-ask for customer ID that was already provided",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "Yes, please help me secure my account right away. I'm really worried about more charges appearing."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+    # NOTE(review): the adapter passes only the newest user message, so the "does not re-ask" criterion presumably relies on the agent's own history.
+    _assert_success(result, "Fraud investigation test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_escalation_workflow():  # Two scripted turns that must end with an escalate_to_human tool call.
+    def check_escalation_called(state: scenario.ScenarioState):
+        """Verify agent escalates when customer explicitly demands human help"""
+        assert state.has_tool_call(
+            "escalate_to_human"
+        ), "Agent should escalate when customer demands manager/human help"
+        # The reason argument is only inspected when last_tool_call returns a call.
+        tool_call = state.last_tool_call("escalate_to_human")
+        if tool_call:
+            args = _parse_tool_arguments(tool_call)
+            reason = args.get("reason", "").lower()
+            assert any(
+                keyword in reason
+                for keyword in ["frustrated", "manager", "human", "escalation"]
+            ), "Escalation reason should reflect customer's frustration and demand"
+    # NOTE(review): the keyword check above is a substring match on free text; a null "reason" would raise on .lower().
+    result = await scenario.run(
+        name="customer escalation to human agent - MiniMax",
+        description="""
+            Customer has been dealing with an ongoing issue and is frustrated.
+            They explicitly demand to speak with a human agent or manager.
+            The agent should handle this professionally and escalate appropriately.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent acknowledges customer's frustration empathetically",
+                    "Agent offers to escalate when requested",
+                    "Agent provides escalation timeline and process information",
+                    "Agent maintains professionalism despite customer frustration",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service."
+            ),
+            scenario.agent(),
+            check_escalation_called,
+            scenario.judge(),
+        ],
+    )
+    # check_escalation_called sits in the script after the second agent turn and before the judge step.
+    _assert_success(result, "Escalation test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_complex_issue_triggers_knowledge_base():  # Two scripted turns: three simultaneous account problems, then added urgency about paying bills.
+    result = await scenario.run(
+        name="complex multi-issue banking problem - MiniMax",
+        description="""
+            Customer has multiple interconnected banking problems: locked online banking,
+            unexpected fees, and missing direct deposit. They need systematic help.
+            This tests whether the agent can handle multiple issues comprehensively.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
+                    "Agent provides systematic approach with clear steps for each issue",
+                    "Agent shows empathy for customer's stress and urgency",
+                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
+                    "Agent offers concrete next steps that the customer can act on",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+    # NOTE(review): despite the test name, no knowledge-base tool call is asserted; only the judge criteria decide the outcome.
+    _assert_success(result, "Complex issue test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_urgent_business_scenario():  # Two scripted turns: frozen business account with payroll due, then a demand for immediate action.
+    result = await scenario.run(
+        name="urgent business account problem - MiniMax",
+        description="""
+            Business customer has an urgent issue affecting their operations.
+            They can't access funds to pay employees. This tests whether the agent
+            recognizes urgency and takes appropriate high-priority action.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent immediately recognizes the business urgency and employee impact",
+                    "Agent responds with high priority and urgency in tone",
+                    "Agent takes concrete action (investigating the freeze or escalating to specialists)",
+                    "Agent provides realistic timeline or sets expectations appropriately",
+                    "Agent offers interim solutions or workarounds if available",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+    # The "concrete action" criterion is judged from the transcript; no specific tool call is asserted here.
+    _assert_success(result, "Urgent business test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_simple_inquiry_no_tools():  # Single scripted turn: a support-hours question, then the judge.
+    result = await scenario.run(
+        name="simple inquiry without tool usage - MiniMax",
+        description="""
+            Customer asks a simple question about branch hours or general banking info.
+            The agent should answer directly without invoking any tools.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent answers the simple question directly and helpfully",
+                    "Agent does not over-complicate the response",
+                    "Agent maintains a friendly and professional tone",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "What are your customer support hours? I just want to know when I can call if I have an issue."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+    # NOTE(review): "no tools" is stated in the description only; no tool-call assertion in this test backs it.
+    _assert_success(result, "Simple inquiry test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_spending_analysis_request():  # Two scripted turns: spending breakdown for CUST_001, then where to cut back.
+    result = await scenario.run(
+        name="spending analysis and budgeting help - MiniMax",
+        description="""
+            Customer wants to understand their spending patterns and get budgeting advice.
+            The agent should use explore_customer_account to analyze their transactions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent uses account exploration tools to analyze spending",
+                    "Agent provides specific insights about spending categories",
+                    "Agent offers actionable budgeting advice or recommendations",
+                    "Agent is helpful and non-judgmental about spending habits",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "That's really helpful. Are there any areas where you think I could cut back?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+    # NOTE(review): explore_customer_account usage is left to the judge criteria; no tool-call assertion is made here.
+    _assert_success(result, "Spending analysis test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_lost_card_replacement():  # Two scripted turns: lost debit card report, then a request to check activity and replace it.
+    result = await scenario.run(
+        name="lost card replacement workflow - MiniMax",
+        description="""
+            Customer has lost their debit card and needs a replacement.
+            Tests whether the agent handles the card replacement process properly
+            including immediate security measures.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent treats lost card with appropriate urgency",
+                    "Agent suggests freezing or blocking the lost card immediately",
+                    "Agent explains the replacement card process and timeline",
+                    "Agent asks about any unauthorized transactions since the card was lost",
+                    "Agent reassures the customer about account security",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+    # Pass/fail is taken from the scenario result via _assert_success; no tool call is asserted directly.
+    _assert_success(result, "Lost card replacement test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_overdraft_fee_dispute():  # Two scripted turns: $35 overdraft fee complaint from CUST_002, then a waiver request.
+    result = await scenario.run(
+        name="overdraft fee dispute and resolution - MiniMax",
+        description="""
+            Customer with a basic checking account notices an overdraft fee and wants
+            it reversed. Tests empathy, account investigation, and fee resolution.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent shows empathy for the customer's frustration about the fee",
+                    "Agent investigates the account to understand the overdraft situation",
+                    "Agent explains how the overdraft fee occurred",
+                    "Agent offers a resolution path (fee waiver, escalation, or explanation)",
+                    "Agent suggests ways to avoid future overdraft fees",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "This isn't fair. I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+    # Pass/fail is taken from the scenario result via _assert_success; no tool call is asserted directly.
+    _assert_success(result, "Overdraft fee dispute test")
+
+
+if __name__ == "__main__":  # Script entry point: runs only the fraud investigation scenario, without pytest.
+    asyncio.run(test_fraud_investigation_workflow())
diff --git a/tests-demo/test_demo_openai.py b/tests-demo/test_demo_openai.py
new file mode 100644
index 0000000..198175d
--- /dev/null
+++ b/tests-demo/test_demo_openai.py
@@ -0,0 +1,453 @@
+"""
+Tests for the main bank customer support agent - OpenAI Model
+
+These tests cover real business scenarios and validate tool calling behavior
+using the OpenAI model configured in main_support_agent_openai for evaluation.
+"""
+import asyncio
+import pytest
+import json
+import sys
+import os
+import dotenv
+
+# Add parent directory to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import scenario
+from main_support_agent_openai import support_agent
+
+dotenv.load_dotenv()
+scenario.configure(default_model="openai/gpt-4o")
+
+
+def _parse_tool_arguments(tool_call: dict) -> dict:
+    """Return the call's ``function.arguments`` as a dict, or ``{}`` if absent or not a JSON object."""
+    raw_args = tool_call["function"].get("arguments", {})
+    if isinstance(raw_args, str):
+        try:
+            raw_args = json.loads(raw_args)
+        except json.JSONDecodeError:
+            return {}
+    # Valid JSON such as "null" or "[1]" is not a dict; callers use .get() on the result.
+    return raw_args if isinstance(raw_args, dict) else {}
+
+
+def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None:  # Fail with test_name and result.reasoning when result.success is falsy.
+    assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}"
+
+
+def _build_tool_trace_messages(response) -> list[dict]:
+    """Rebuild an OpenAI-style tool transcript from ``response.tools``.
+
+    Each tool yields an assistant message carrying one ``tool_calls`` entry and
+    the matching ``tool`` message; a non-empty string ``response.content`` is
+    appended last as a plain assistant message. Missing ids and names get
+    placeholders, and non-dict arguments are recorded as ``{}``.
+    """
+    messages: list[dict] = []
+    for i, tool in enumerate(response.tools or []):
+        tool_call_id = tool.tool_call_id or f"tool_call_{i}"
+        if isinstance(tool.result, str):
+            tool_result = tool.result
+        else:
+            # Only a missing result becomes "{}" (False/0/[] are real results);
+            # default=str keeps non-JSON-serializable results from raising.
+            payload = {} if tool.result is None else tool.result
+            tool_result = json.dumps(payload, default=str)
+        function = {
+            "name": tool.tool_name or "unknown_tool",
+            "arguments": json.dumps(tool.tool_args if isinstance(tool.tool_args, dict) else {}),
+        }
+        messages.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [{"id": tool_call_id, "type": "function", "function": function}],
+            }
+        )
+        # The tool message must reference the id used in the assistant message.
+        messages.append({"role": "tool", "tool_call_id": tool_call_id, "content": tool_result})
+
+    if isinstance(response.content, str) and response.content:
+        messages.append({"role": "assistant", "content": response.content})
+
+    return messages
+
+
+class BankSupportAgentAdapter(scenario.AgentAdapter):
+    """Scenario adapter that forwards each new user message to ``support_agent``."""
+
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        """Run the agent on the latest user message and return its output.
+
+        The result is, in order of preference: the run's assistant/tool
+        messages when any of them carries tool calls, a transcript rebuilt
+        from ``response.tools``, or the plain ``response.content``.
+        """
+        # NOTE(review): run() is a synchronous call inside a coroutine, so it
+        # presumably blocks the event loop for the duration of the agent run.
+        response = support_agent.run(input.last_new_user_message_str())
+
+        # Scenario manages the conversation flow itself, so system and user
+        # messages are dropped; tool messages are kept because OpenAI requires
+        # them alongside the tool calls they answer.
+        turn_messages = []
+        for message in response.messages or []:
+            if message.role not in ("assistant", "tool"):
+                continue
+            converted = {"role": message.role, "content": message.content}
+            if message.tool_calls:
+                converted["tool_calls"] = message.tool_calls
+            if getattr(message, "tool_call_id", None):
+                converted["tool_call_id"] = message.tool_call_id
+            turn_messages.append(converted)
+
+        carries_tool_calls = any(
+            entry["role"] == "assistant" and "tool_calls" in entry
+            for entry in turn_messages
+        )
+        if carries_tool_calls:
+            return turn_messages
+
+        # Without tool calls in the message list, rebuild the trace from the
+        # tools recorded on the response.
+        rebuilt = _build_tool_trace_messages(response)
+        if rebuilt:
+            return rebuilt
+
+        return response.content  # type: ignore
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_fraud_investigation_workflow():  # Three scripted turns: fraud report, customer ID plus disputed charges, request to secure the account.
+    result = await scenario.run(
+        name="fraud investigation and card security - OpenAI",
+        description="""
+            Customer discovers unauthorized transactions on their account and is worried about fraud.
+            They need immediate help to secure their account and investigate the suspicious activity.
+            This tests whether the agent responds with appropriate urgency and offers concrete security actions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent takes fraud concerns seriously and responds with urgency",
+                    "Agent gathers necessary information (account details) to investigate",
+                    "Agent offers concrete security actions like card freezing or blocking",
+                    "Agent provides clear next steps for fraud investigation and dispute process",
+                    "Agent maintains professional and reassuring tone throughout",
+                    "Agent does not re-ask for customer ID that was already provided",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "Yes, please help me secure my account right away. I'm really worried about more charges appearing."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+    # NOTE(review): the adapter passes only the newest user message, so the "does not re-ask" criterion presumably relies on the agent's own history.
+    _assert_success(result, "Fraud investigation test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_escalation_workflow():  # Two scripted turns that must end with an escalate_to_human tool call.
+    def check_escalation_called(state: scenario.ScenarioState):
+        """Verify agent escalates when customer explicitly demands human help"""
+        assert state.has_tool_call(
+            "escalate_to_human"
+        ), "Agent should escalate when customer demands manager/human help"
+        # The reason argument is only inspected when last_tool_call returns a call.
+        tool_call = state.last_tool_call("escalate_to_human")
+        if tool_call:
+            args = _parse_tool_arguments(tool_call)
+            reason = args.get("reason", "").lower()
+            assert any(
+                keyword in reason
+                for keyword in ["frustrated", "manager", "human", "escalation"]
+            ), "Escalation reason should reflect customer's frustration and demand"
+    # NOTE(review): the keyword check above is a substring match on free text; a null "reason" would raise on .lower().
+    result = await scenario.run(
+        name="customer escalation to human agent - OpenAI",
+        description="""
+            Customer has been dealing with an ongoing issue and is frustrated.
+            They explicitly demand to speak with a human agent or manager.
+            The agent should handle this professionally and escalate appropriately.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent acknowledges customer's frustration empathetically",
+                    "Agent offers to escalate when requested",
+                    "Agent provides escalation timeline and process information",
+                    "Agent maintains professionalism despite customer frustration",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service."
+            ),
+            scenario.agent(),
+            check_escalation_called,
+            scenario.judge(),
+        ],
+    )
+    # check_escalation_called sits in the script after the second agent turn and before the judge step.
+    _assert_success(result, "Escalation test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_complex_issue_triggers_knowledge_base():
+    result = await scenario.run(
+        name="complex multi-issue banking problem - OpenAI",
+        description="""
+            Customer has multiple interconnected banking problems: locked online banking,
+            unexpected fees, and missing direct deposit. They need systematic help.
+            This tests whether the agent can handle multiple issues comprehensively.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
+                    "Agent provides systematic approach with clear steps for each issue",
+                    "Agent shows empathy for customer's stress and urgency",
+                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
+                    "Agent offers concrete next steps that the customer can act on",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Complex issue test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_urgent_business_scenario():
+    result = await scenario.run(
+        name="urgent business account problem - OpenAI",
+        description="""
+            Business customer has an urgent issue affecting their operations.
+            They can't access funds to pay employees. This tests whether the agent
+            recognizes urgency and takes appropriate high-priority action.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent immediately recognizes the business urgency and employee impact",
+                    "Agent responds with high priority and urgency in tone",
+                    "Agent takes concrete action (investigating the freeze or escalating to specialists)",
+                    "Agent provides realistic timeline or sets expectations appropriately",
+                    "Agent offers interim solutions or workarounds if available",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Urgent business test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_simple_inquiry_no_tools():
+    result = await scenario.run(
+        name="simple inquiry without tool usage - OpenAI",
+        description="""
+            Customer asks a simple question about branch hours or general banking info.
+            The agent should answer directly without invoking any tools.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent answers the simple question directly and helpfully",
+                    "Agent does not over-complicate the response",
+                    "Agent maintains a friendly and professional tone",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "What are your customer support hours? I just want to know when I can call if I have an issue."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Simple inquiry test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_spending_analysis_request():
+    result = await scenario.run(
+        name="spending analysis and budgeting help - OpenAI",
+        description="""
+            Customer wants to understand their spending patterns and get budgeting advice.
+            The agent should use explore_customer_account to analyze their transactions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent uses account exploration tools to analyze spending",
+                    "Agent provides specific insights about spending categories",
+                    "Agent offers actionable budgeting advice or recommendations",
+                    "Agent is helpful and non-judgmental about spending habits",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "That's really helpful. Are there any areas where you think I could cut back?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Spending analysis test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_lost_card_replacement():
+    result = await scenario.run(
+        name="lost card replacement workflow - OpenAI",
+        description="""
+            Customer has lost their debit card and needs a replacement.
+            Tests whether the agent handles the card replacement process properly
+            including immediate security measures.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent treats lost card with appropriate urgency",
+                    "Agent suggests freezing or blocking the lost card immediately",
+                    "Agent explains the replacement card process and timeline",
+                    "Agent asks about any unauthorized transactions since the card was lost",
+                    "Agent reassures the customer about account security",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Lost card replacement test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_overdraft_fee_dispute():
+    result = await scenario.run(
+        name="overdraft fee dispute and resolution - OpenAI",
+        description="""
+            Customer with a basic checking account notices an overdraft fee and wants
+            it reversed. Tests empathy, account investigation, and fee resolution.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent shows empathy for the customer's frustration about the fee",
+                    "Agent investigates the account to understand the overdraft situation",
+                    "Agent explains how the overdraft fee occurred",
+                    "Agent offers a resolution path (fee waiver, escalation, or explanation)",
+                    "Agent suggests ways to avoid future overdraft fees",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "This isn't fair. I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Overdraft fee dispute test")
+
+
+if __name__ == "__main__":  # Direct execution runs only test_fraud_investigation_workflow.
+    asyncio.run(test_fraud_investigation_workflow())
diff --git a/tests/test_main_support_agent.py b/tests/test_main_support_agent.py
index 1376f81..d28f4ff 100644
--- a/tests/test_main_support_agent.py
+++ b/tests/test_main_support_agent.py
@@ -17,7 +17,7 @@
 from main_support_agent import support_agent
 
 dotenv.load_dotenv()
-scenario.configure(default_model="openai/gpt-4o-mini")
+scenario.configure(default_model="nebius/openai/gpt-oss-120b")
 
 
 class BankSupportAgentAdapter(scenario.AgentAdapter):