From 0b9a637003f7fc54b11513e1cf347875ff89d1d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rog=C3=A9rio=20Chaves?=
 <rogeriochaves@users.noreply.github.com>
Date: Sat, 6 Dec 2025 09:30:23 +0100
Subject: [PATCH 1/6] migrate from openai to nebius

---
 agents/customer_explorer_agent.py | 11 ++++++++---
 agents/next_message_agent.py      | 11 ++++++++---
 agents/summary_agent.py           | 11 ++++++++---
 main_support_agent.py             | 11 ++++++++---
 tests/test_main_support_agent.py  |  2 +-
 5 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/agents/customer_explorer_agent.py b/agents/customer_explorer_agent.py
index a1d8adc..f23bbda 100644
--- a/agents/customer_explorer_agent.py
+++ b/agents/customer_explorer_agent.py
@@ -6,6 +6,7 @@
 import dotenv
 from agno.agent import Agent
 from agno.models.openai import OpenAIChat
+from agno.models.nebius import Nebius
 from pydantic import BaseModel
 from datetime import datetime, timedelta
 import json
@@ -116,9 +117,13 @@ def create_customer_explorer_agent() -> Agent:
     """Create and return the customer explorer agent"""
     return Agent(
         name="CustomerExplorerAgent",
-        model=OpenAIChat(
-            id="gpt-4o-mini",
-            api_key=os.getenv("OPENAI_API_KEY"),
+        # model=OpenAIChat(
+        #     id="gpt-4o-mini",
+        #     api_key=os.getenv("OPENAI_API_KEY"),
+        # ),
+        model=Nebius(
+            id="openai/gpt-oss-120b",
+            api_key=os.getenv("NEBIUS_API_KEY"),
         ),
         description=CUSTOMER_EXPLORER_SYSTEM_PROMPT,
         add_history_to_context=False,
diff --git a/agents/next_message_agent.py b/agents/next_message_agent.py
index 90e956d..1fbf194 100644
--- a/agents/next_message_agent.py
+++ b/agents/next_message_agent.py
@@ -6,6 +6,7 @@
 import dotenv
 from agno.agent import Agent
 from agno.models.openai import OpenAIChat
+from agno.models.nebius import Nebius
 from pydantic import BaseModel
 
 dotenv.load_dotenv()
@@ -131,9 +132,13 @@ def create_next_message_agent() -> Agent:
     """Create and return the next message agent"""
     return Agent(
         name="NextMessageAgent",
-        model=OpenAIChat(
-            id="gpt-4o-mini",
-            api_key=os.getenv("OPENAI_API_KEY"),
+        # model=OpenAIChat(
+        #     id="gpt-4o-mini",
+        #     api_key=os.getenv("OPENAI_API_KEY"),
+        # ),
+        model=Nebius(
+            id="openai/gpt-oss-120b",
+            api_key=os.getenv("NEBIUS_API_KEY"),
         ),
         description=NEXT_MESSAGE_SYSTEM_PROMPT,
         add_history_to_context=False,
diff --git a/agents/summary_agent.py b/agents/summary_agent.py
index 272871b..4002b96 100644
--- a/agents/summary_agent.py
+++ b/agents/summary_agent.py
@@ -6,6 +6,7 @@
 import dotenv
 from agno.agent import Agent
 from agno.models.openai import OpenAIChat
+from agno.models.nebius import Nebius
 from pydantic import BaseModel
 
 dotenv.load_dotenv()
@@ -46,9 +47,13 @@ def create_summary_agent() -> Agent:
     """Create and return the summary agent"""
     return Agent(
         name="SummaryAgent",
-        model=OpenAIChat(
-            id="gpt-4o-mini",
-            api_key=os.getenv("OPENAI_API_KEY"),
+        # model=OpenAIChat(
+        #     id="gpt-4o-mini",
+        #     api_key=os.getenv("OPENAI_API_KEY"),
+        # ),
+        model=Nebius(
+            id="openai/gpt-oss-120b",
+            api_key=os.getenv("NEBIUS_API_KEY"),
         ),
         description=SUMMARY_SYSTEM_PROMPT,
         add_history_to_context=False,  # Each call is independent
diff --git a/main_support_agent.py b/main_support_agent.py
index faa8a26..c21ddd4 100644
--- a/main_support_agent.py
+++ b/main_support_agent.py
@@ -10,6 +10,7 @@
 import dotenv
 from agno.agent import Agent
 from agno.models.openai import OpenAIChat
+from agno.models.nebius import Nebius
 
 # Import our specialized agents as tools
 from agents.summary_agent import summarize_conversation
@@ -179,9 +180,13 @@ def escalate_to_human(reason: str, urgency: str = "medium") -> str:
 # Create the main support agent
 support_agent = Agent(
     name="BankCustomerSupportAgent",
-    model=OpenAIChat(
-        id="gpt-4o-mini",
-        api_key=os.getenv("OPENAI_API_KEY"),
+    # model=OpenAIChat(
+    #     id="gpt-4o-mini",
+    #     api_key=os.getenv("OPENAI_API_KEY"),
+    # ),
+    model=Nebius(
+        id="openai/gpt-oss-120b",
+        api_key=os.getenv("NEBIUS_API_KEY"),
     ),
     tools=[
         get_conversation_summary,
diff --git a/tests/test_main_support_agent.py b/tests/test_main_support_agent.py
index 1376f81..d28f4ff 100644
--- a/tests/test_main_support_agent.py
+++ b/tests/test_main_support_agent.py
@@ -17,7 +17,7 @@
 from main_support_agent import support_agent
 
 dotenv.load_dotenv()
-scenario.configure(default_model="openai/gpt-4o-mini")
+scenario.configure(default_model="nebius/openai/gpt-oss-120b")
 
 
 class BankSupportAgentAdapter(scenario.AgentAdapter):

From 3b8c1f75a2bb9ddcf5990f5a6a5689dacd0b3d6b Mon Sep 17 00:00:00 2001
From: aryansharma28 <aryansharma2k2@gmail.com>
Date: Mon, 9 Feb 2026 11:13:04 +0100
Subject: [PATCH 2/6] nebius demo

---
 tests-demo/__init__.py           |   3 +
 tests-demo/test_demo_deepseek.py | 179 +++++++++++++++++++++++++++++++
 tests-demo/test_demo_glm.py      | 179 +++++++++++++++++++++++++++++++
 tests-demo/test_demo_minimax.py  | 179 +++++++++++++++++++++++++++++++
 tests-demo/test_demo_openai.py   | 179 +++++++++++++++++++++++++++++++
 5 files changed, 719 insertions(+)
 create mode 100644 tests-demo/__init__.py
 create mode 100644 tests-demo/test_demo_deepseek.py
 create mode 100644 tests-demo/test_demo_glm.py
 create mode 100644 tests-demo/test_demo_minimax.py
 create mode 100644 tests-demo/test_demo_openai.py

diff --git a/tests-demo/__init__.py b/tests-demo/__init__.py
new file mode 100644
index 0000000..26bd4a3
--- /dev/null
+++ b/tests-demo/__init__.py
@@ -0,0 +1,3 @@
+"""
+Demo tests for bank customer support agent across different models
+"""
diff --git a/tests-demo/test_demo_deepseek.py b/tests-demo/test_demo_deepseek.py
new file mode 100644
index 0000000..91a4060
--- /dev/null
+++ b/tests-demo/test_demo_deepseek.py
@@ -0,0 +1,179 @@
+"""
+Demo test showing proper Scenario usage with tool call validation - DeepSeek Model
+
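+This demonstrates the key capabilities for the customer demo with Scenario's default_model set to a Nebius DeepSeek model; the support agent's own model is configured in main_support_agent.py.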
+"""
+import asyncio
+import json
+import sys
+import os
+import dotenv
+
+# Add parent directory to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import scenario
+from main_support_agent import support_agent
+
+dotenv.load_dotenv()
+scenario.configure(default_model="nebius/deepseek-ai/DeepSeek-V3.2")
+
+class BankSupportAgentAdapter(scenario.AgentAdapter):  # lets Scenario drive the shared support_agent
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        message_content = input.last_new_user_message_str()  # only this string is forwarded to the agent
+        response = support_agent.run(message_content)  # NOTE(review): synchronous call inside a coroutine — confirm blocking here is acceptable
+        return response.content
+
+async def test_fraud_with_tool_validation():
+    """
+    Demo test: Fraud investigation with proper tool call validation
+
+    Runs a scripted two-turn fraud conversation through scenario.run, inspects
+    recorded tool calls after the first agent turn, and returns result.success.
+    """
+
+    def check_fraud_tool_usage(state: scenario.ScenarioState):
+        """Print which fraud-related tools were called and return True if either was."""
+        print(f"\n🔍 Checking tool calls after {len(state.messages)} messages...")
+
+        # Check if customer exploration was called
+        has_exploration = state.has_tool_call("explore_customer_account")
+        print(f"   explore_customer_account called: {has_exploration}")
+
+        if has_exploration:
+            tool_call = state.last_tool_call("explore_customer_account")
+            if tool_call:
+                args = json.loads(tool_call["function"]["arguments"])  # arguments are parsed as JSON text
+                print(f"   Tool arguments: {args}")
+
+                # Informational only: the keyword match below is printed, not enforced
+                query = args.get("query", "").lower()
+                fraud_keywords = ["fraud", "security", "unauthorized", "suspicious"]
+                has_fraud_context = any(keyword in query for keyword in fraud_keywords)
+                print(f"   Query contains fraud context: {has_fraud_context}")
+
+        # For demo purposes, let's be flexible - either exploration or escalation is appropriate
+        has_escalation = state.has_tool_call("escalate_to_human")
+        print(f"   escalate_to_human called: {has_escalation}")
+
+        # At least one appropriate tool should be called for fraud concerns
+        appropriate_response = has_exploration or has_escalation
+        print(f"   ✅ Appropriate fraud response: {appropriate_response}")
+
+        return appropriate_response  # NOTE(review): confirm Scenario acts on a bool returned by a script step; if not, this check cannot fail the run
+
+    print("🎭 Running fraud investigation demo with tool validation (DeepSeek)...")
+
+    result = await scenario.run(
+        name="fraud investigation demo - DeepSeek",
+        description="""
+            Customer reports suspicious transactions and potential fraud.
+            The agent should take this seriously and use appropriate tools
+            to investigate or escalate the security concern.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(criteria=[
+                "Agent takes fraud concerns seriously",
+                "Agent offers security measures or investigation",
+                "Agent maintains professional and reassuring tone"
+            ])
+        ],
+        script=[
+            scenario.user("I think someone stole my card! There are charges I didn't make - $85 at Amazon and $45 at a gas station."),
+            scenario.agent(),
+            check_fraud_tool_usage,  # custom step placed after the first agent turn
+            scenario.user("Yes, please help me secure my account immediately!"),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
+    if result.reasoning:
+        print(f"Reasoning: {result.reasoning}")
+
+    return result.success
+
+async def test_escalation_detection():
+    """
+    Demo test: angry-customer script checked for an escalate_to_human call; returns result.success
+    """
+
+    def verify_escalation_logic(state: scenario.ScenarioState):
+        """Print escalate_to_human call details and return whether it was called."""
+        print(f"\n🚨 Checking escalation logic...")
+
+        has_escalation = state.has_tool_call("escalate_to_human")
+        print(f"   escalate_to_human called: {has_escalation}")
+
+        if has_escalation:
+            tool_call = state.last_tool_call("escalate_to_human")
+            if tool_call:
+                args = json.loads(tool_call["function"]["arguments"])  # arguments are parsed as JSON text
+                reason = args.get("reason", "")
+                urgency = args.get("urgency", "medium")  # fallback used only for display when the call omitted urgency
+                print(f"   Escalation reason: {reason}")
+                print(f"   Urgency level: {urgency}")
+
+        print(f"   ✅ Escalation handled: {has_escalation}")  # note: the ✅ prefix is printed even when the value is False
+        return has_escalation  # NOTE(review): confirm Scenario acts on a bool returned by a script step; if not, this check cannot fail the run
+
+    print("\n🎭 Running escalation detection demo (DeepSeek)...")
+
+    result = await scenario.run(
+        name="escalation detection demo - DeepSeek",
+        description="""
+            Frustrated customer demands to speak with a manager.
+            Agent should recognize the escalation need and handle appropriately.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(criteria=[
+                "Agent acknowledges customer frustration",
+                "Agent offers escalation when demanded",
+                "Agent maintains professionalism"
+            ])
+        ],
+        script=[
+            scenario.user("This is ridiculous! I want to speak to your manager RIGHT NOW! Nobody can help me with this issue!"),
+            scenario.agent(),
+            verify_escalation_logic,  # custom step placed after the only agent turn
+            scenario.judge(),
+        ],
+    )
+
+    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
+    return result.success
+
+async def main():
+    """Run both demo scenarios one after another and print a pass/fail summary."""
+    print("🚀 Bank Customer Support Agent - Scenario Demo (DeepSeek)")
+    print("=" * 50)
+
+    # Test 1: Fraud Investigation
+    fraud_passed = await test_fraud_with_tool_validation()
+
+    # Test 2: Escalation Detection
+    escalation_passed = await test_escalation_detection()
+
+    print("\n" + "=" * 50)
+    print("📈 Demo Summary:")
+    print(f"   Fraud Investigation: {'✅ PASSED' if fraud_passed else '❌ FAILED'}")
+    print(f"   Escalation Detection: {'✅ PASSED' if escalation_passed else '❌ FAILED'}")
+
+    overall_success = fraud_passed and escalation_passed  # only reported via print; main() returns None and sets no exit code
+    print(f"\n🎯 Overall Demo: {'✅ SUCCESS' if overall_success else '❌ NEEDS WORK'}")
+
+    if overall_success:
+        print("\n🎉 Demo ready! This shows:")
+        print("   • Proper Scenario framework usage")
+        print("   • Tool calling validation")
+        print("   • Custom assertions for business logic")
+        print("   • Realistic user simulation")
+        print("   • Automated quality assessment")
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/tests-demo/test_demo_glm.py b/tests-demo/test_demo_glm.py
new file mode 100644
index 0000000..cc267c6
--- /dev/null
+++ b/tests-demo/test_demo_glm.py
@@ -0,0 +1,179 @@
+"""
+Demo test showing proper Scenario usage with tool call validation - GLM Model
+
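+This demonstrates the key capabilities for the customer demo with Scenario's default_model set to a Nebius GLM model; the support agent's own model is configured in main_support_agent.py.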
+"""
+import asyncio
+import json
+import sys
+import os
+import dotenv
+
+# Add parent directory to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import scenario
+from main_support_agent import support_agent
+
+dotenv.load_dotenv()
+scenario.configure(default_model="nebius/zai-org/GLM-4.7-FP8")
+
+class BankSupportAgentAdapter(scenario.AgentAdapter):  # lets Scenario drive the shared support_agent
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        message_content = input.last_new_user_message_str()  # only this string is forwarded to the agent
+        response = support_agent.run(message_content)  # NOTE(review): synchronous call inside a coroutine — confirm blocking here is acceptable
+        return response.content
+
+async def test_fraud_with_tool_validation():
+    """
+    Demo test: Fraud investigation with proper tool call validation
+
+    Runs a scripted two-turn fraud conversation through scenario.run, inspects
+    recorded tool calls after the first agent turn, and returns result.success.
+    """
+
+    def check_fraud_tool_usage(state: scenario.ScenarioState):
+        """Print which fraud-related tools were called and return True if either was."""
+        print(f"\n🔍 Checking tool calls after {len(state.messages)} messages...")
+
+        # Check if customer exploration was called
+        has_exploration = state.has_tool_call("explore_customer_account")
+        print(f"   explore_customer_account called: {has_exploration}")
+
+        if has_exploration:
+            tool_call = state.last_tool_call("explore_customer_account")
+            if tool_call:
+                args = json.loads(tool_call["function"]["arguments"])  # arguments are parsed as JSON text
+                print(f"   Tool arguments: {args}")
+
+                # Informational only: the keyword match below is printed, not enforced
+                query = args.get("query", "").lower()
+                fraud_keywords = ["fraud", "security", "unauthorized", "suspicious"]
+                has_fraud_context = any(keyword in query for keyword in fraud_keywords)
+                print(f"   Query contains fraud context: {has_fraud_context}")
+
+        # For demo purposes, let's be flexible - either exploration or escalation is appropriate
+        has_escalation = state.has_tool_call("escalate_to_human")
+        print(f"   escalate_to_human called: {has_escalation}")
+
+        # At least one appropriate tool should be called for fraud concerns
+        appropriate_response = has_exploration or has_escalation
+        print(f"   ✅ Appropriate fraud response: {appropriate_response}")
+
+        return appropriate_response  # NOTE(review): confirm Scenario acts on a bool returned by a script step; if not, this check cannot fail the run
+
+    print("🎭 Running fraud investigation demo with tool validation (GLM)...")
+
+    result = await scenario.run(
+        name="fraud investigation demo - GLM",
+        description="""
+            Customer reports suspicious transactions and potential fraud.
+            The agent should take this seriously and use appropriate tools
+            to investigate or escalate the security concern.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(criteria=[
+                "Agent takes fraud concerns seriously",
+                "Agent offers security measures or investigation",
+                "Agent maintains professional and reassuring tone"
+            ])
+        ],
+        script=[
+            scenario.user("I think someone stole my card! There are charges I didn't make - $85 at Amazon and $45 at a gas station."),
+            scenario.agent(),
+            check_fraud_tool_usage,  # custom step placed after the first agent turn
+            scenario.user("Yes, please help me secure my account immediately!"),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
+    if result.reasoning:
+        print(f"Reasoning: {result.reasoning}")
+
+    return result.success
+
+async def test_escalation_detection():
+    """
+    Demo test: angry-customer script checked for an escalate_to_human call; returns result.success
+    """
+
+    def verify_escalation_logic(state: scenario.ScenarioState):
+        """Print escalate_to_human call details and return whether it was called."""
+        print(f"\n🚨 Checking escalation logic...")
+
+        has_escalation = state.has_tool_call("escalate_to_human")
+        print(f"   escalate_to_human called: {has_escalation}")
+
+        if has_escalation:
+            tool_call = state.last_tool_call("escalate_to_human")
+            if tool_call:
+                args = json.loads(tool_call["function"]["arguments"])  # arguments are parsed as JSON text
+                reason = args.get("reason", "")
+                urgency = args.get("urgency", "medium")  # fallback used only for display when the call omitted urgency
+                print(f"   Escalation reason: {reason}")
+                print(f"   Urgency level: {urgency}")
+
+        print(f"   ✅ Escalation handled: {has_escalation}")  # note: the ✅ prefix is printed even when the value is False
+        return has_escalation  # NOTE(review): confirm Scenario acts on a bool returned by a script step; if not, this check cannot fail the run
+
+    print("\n🎭 Running escalation detection demo (GLM)...")
+
+    result = await scenario.run(
+        name="escalation detection demo - GLM",
+        description="""
+            Frustrated customer demands to speak with a manager.
+            Agent should recognize the escalation need and handle appropriately.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(criteria=[
+                "Agent acknowledges customer frustration",
+                "Agent offers escalation when demanded",
+                "Agent maintains professionalism"
+            ])
+        ],
+        script=[
+            scenario.user("This is ridiculous! I want to speak to your manager RIGHT NOW! Nobody can help me with this issue!"),
+            scenario.agent(),
+            verify_escalation_logic,  # custom step placed after the only agent turn
+            scenario.judge(),
+        ],
+    )
+
+    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
+    return result.success
+
+async def main():
+    """Run both demo scenarios one after another and print a pass/fail summary."""
+    print("🚀 Bank Customer Support Agent - Scenario Demo (GLM)")
+    print("=" * 50)
+
+    # Test 1: Fraud Investigation
+    fraud_passed = await test_fraud_with_tool_validation()
+
+    # Test 2: Escalation Detection
+    escalation_passed = await test_escalation_detection()
+
+    print("\n" + "=" * 50)
+    print("📈 Demo Summary:")
+    print(f"   Fraud Investigation: {'✅ PASSED' if fraud_passed else '❌ FAILED'}")
+    print(f"   Escalation Detection: {'✅ PASSED' if escalation_passed else '❌ FAILED'}")
+
+    overall_success = fraud_passed and escalation_passed  # only reported via print; main() returns None and sets no exit code
+    print(f"\n🎯 Overall Demo: {'✅ SUCCESS' if overall_success else '❌ NEEDS WORK'}")
+
+    if overall_success:
+        print("\n🎉 Demo ready! This shows:")
+        print("   • Proper Scenario framework usage")
+        print("   • Tool calling validation")
+        print("   • Custom assertions for business logic")
+        print("   • Realistic user simulation")
+        print("   • Automated quality assessment")
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/tests-demo/test_demo_minimax.py b/tests-demo/test_demo_minimax.py
new file mode 100644
index 0000000..40972ea
--- /dev/null
+++ b/tests-demo/test_demo_minimax.py
@@ -0,0 +1,179 @@
+"""
+Demo test showing proper Scenario usage with tool call validation - MiniMax Model
+
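+This demonstrates the key capabilities for the customer demo with Scenario's default_model set to a Nebius MiniMax model; the support agent's own model is configured in main_support_agent.py.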
+"""
+import asyncio
+import json
+import sys
+import os
+import dotenv
+
+# Add parent directory to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import scenario
+from main_support_agent import support_agent
+
+dotenv.load_dotenv()
+scenario.configure(default_model="nebius/MiniMaxAI/MiniMax-M2.1")
+
+class BankSupportAgentAdapter(scenario.AgentAdapter):  # lets Scenario drive the shared support_agent
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        message_content = input.last_new_user_message_str()  # only this string is forwarded to the agent
+        response = support_agent.run(message_content)  # NOTE(review): synchronous call inside a coroutine — confirm blocking here is acceptable
+        return response.content
+
+async def test_fraud_with_tool_validation():
+    """
+    Demo test: Fraud investigation with proper tool call validation
+
+    Runs a scripted two-turn fraud conversation through scenario.run, inspects
+    recorded tool calls after the first agent turn, and returns result.success.
+    """
+
+    def check_fraud_tool_usage(state: scenario.ScenarioState):
+        """Print which fraud-related tools were called and return True if either was."""
+        print(f"\n🔍 Checking tool calls after {len(state.messages)} messages...")
+
+        # Check if customer exploration was called
+        has_exploration = state.has_tool_call("explore_customer_account")
+        print(f"   explore_customer_account called: {has_exploration}")
+
+        if has_exploration:
+            tool_call = state.last_tool_call("explore_customer_account")
+            if tool_call:
+                args = json.loads(tool_call["function"]["arguments"])  # arguments are parsed as JSON text
+                print(f"   Tool arguments: {args}")
+
+                # Informational only: the keyword match below is printed, not enforced
+                query = args.get("query", "").lower()
+                fraud_keywords = ["fraud", "security", "unauthorized", "suspicious"]
+                has_fraud_context = any(keyword in query for keyword in fraud_keywords)
+                print(f"   Query contains fraud context: {has_fraud_context}")
+
+        # For demo purposes, let's be flexible - either exploration or escalation is appropriate
+        has_escalation = state.has_tool_call("escalate_to_human")
+        print(f"   escalate_to_human called: {has_escalation}")
+
+        # At least one appropriate tool should be called for fraud concerns
+        appropriate_response = has_exploration or has_escalation
+        print(f"   ✅ Appropriate fraud response: {appropriate_response}")
+
+        return appropriate_response  # NOTE(review): confirm Scenario acts on a bool returned by a script step; if not, this check cannot fail the run
+
+    print("🎭 Running fraud investigation demo with tool validation (MiniMax)...")
+
+    result = await scenario.run(
+        name="fraud investigation demo - MiniMax",
+        description="""
+            Customer reports suspicious transactions and potential fraud.
+            The agent should take this seriously and use appropriate tools
+            to investigate or escalate the security concern.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(criteria=[
+                "Agent takes fraud concerns seriously",
+                "Agent offers security measures or investigation",
+                "Agent maintains professional and reassuring tone"
+            ])
+        ],
+        script=[
+            scenario.user("I think someone stole my card! There are charges I didn't make - $85 at Amazon and $45 at a gas station."),
+            scenario.agent(),
+            check_fraud_tool_usage,  # custom step placed after the first agent turn
+            scenario.user("Yes, please help me secure my account immediately!"),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
+    if result.reasoning:
+        print(f"Reasoning: {result.reasoning}")
+
+    return result.success
+
+async def test_escalation_detection():
+    """
+    Demo test: angry-customer script checked for an escalate_to_human call; returns result.success
+    """
+
+    def verify_escalation_logic(state: scenario.ScenarioState):
+        """Print escalate_to_human call details and return whether it was called."""
+        print(f"\n🚨 Checking escalation logic...")
+
+        has_escalation = state.has_tool_call("escalate_to_human")
+        print(f"   escalate_to_human called: {has_escalation}")
+
+        if has_escalation:
+            tool_call = state.last_tool_call("escalate_to_human")
+            if tool_call:
+                args = json.loads(tool_call["function"]["arguments"])  # arguments are parsed as JSON text
+                reason = args.get("reason", "")
+                urgency = args.get("urgency", "medium")  # fallback used only for display when the call omitted urgency
+                print(f"   Escalation reason: {reason}")
+                print(f"   Urgency level: {urgency}")
+
+        print(f"   ✅ Escalation handled: {has_escalation}")  # note: the ✅ prefix is printed even when the value is False
+        return has_escalation  # NOTE(review): confirm Scenario acts on a bool returned by a script step; if not, this check cannot fail the run
+
+    print("\n🎭 Running escalation detection demo (MiniMax)...")
+
+    result = await scenario.run(
+        name="escalation detection demo - MiniMax",
+        description="""
+            Frustrated customer demands to speak with a manager.
+            Agent should recognize the escalation need and handle appropriately.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(criteria=[
+                "Agent acknowledges customer frustration",
+                "Agent offers escalation when demanded",
+                "Agent maintains professionalism"
+            ])
+        ],
+        script=[
+            scenario.user("This is ridiculous! I want to speak to your manager RIGHT NOW! Nobody can help me with this issue!"),
+            scenario.agent(),
+            verify_escalation_logic,  # custom step placed after the only agent turn
+            scenario.judge(),
+        ],
+    )
+
+    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
+    return result.success
+
+async def main():
+    """Run both demo scenarios one after another and print a pass/fail summary."""
+    print("🚀 Bank Customer Support Agent - Scenario Demo (MiniMax)")
+    print("=" * 50)
+
+    # Test 1: Fraud Investigation
+    fraud_passed = await test_fraud_with_tool_validation()
+
+    # Test 2: Escalation Detection
+    escalation_passed = await test_escalation_detection()
+
+    print("\n" + "=" * 50)
+    print("📈 Demo Summary:")
+    print(f"   Fraud Investigation: {'✅ PASSED' if fraud_passed else '❌ FAILED'}")
+    print(f"   Escalation Detection: {'✅ PASSED' if escalation_passed else '❌ FAILED'}")
+
+    overall_success = fraud_passed and escalation_passed  # only reported via print; main() returns None and sets no exit code
+    print(f"\n🎯 Overall Demo: {'✅ SUCCESS' if overall_success else '❌ NEEDS WORK'}")
+
+    if overall_success:
+        print("\n🎉 Demo ready! This shows:")
+        print("   • Proper Scenario framework usage")
+        print("   • Tool calling validation")
+        print("   • Custom assertions for business logic")
+        print("   • Realistic user simulation")
+        print("   • Automated quality assessment")
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/tests-demo/test_demo_openai.py b/tests-demo/test_demo_openai.py
new file mode 100644
index 0000000..5f85012
--- /dev/null
+++ b/tests-demo/test_demo_openai.py
@@ -0,0 +1,179 @@
+"""
+Demo test showing proper Scenario usage with tool call validation - OpenAI Model
+
+This demonstrates the key capabilities for the customer demo using OpenAI models.
+"""
+import asyncio
+import json
+import sys
+import os
+import dotenv
+
+# Add parent directory to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import scenario
+from main_support_agent import support_agent
+
+dotenv.load_dotenv()
+scenario.configure(default_model="openai/gpt-4o-mini")
+
+class BankSupportAgentAdapter(scenario.AgentAdapter):
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        message_content = input.last_new_user_message_str()
+        response = support_agent.run(message_content)
+        return response.content
+
+async def test_fraud_with_tool_validation():
+    """
+    Demo test: Fraud investigation with proper tool call validation
+
+    This shows how Scenario can validate that the right tools are called
+    at the right time for specific business scenarios.
+    """
+
+    def check_fraud_tool_usage(state: scenario.ScenarioState):
+        """Custom assertion to verify fraud investigation tools were used"""
+        print(f"\n🔍 Checking tool calls after {len(state.messages)} messages...")
+
+        # Check if customer exploration was called
+        has_exploration = state.has_tool_call("explore_customer_account")
+        print(f"   explore_customer_account called: {has_exploration}")
+
+        if has_exploration:
+            tool_call = state.last_tool_call("explore_customer_account")
+            if tool_call:
+                args = json.loads(tool_call["function"]["arguments"])
+                print(f"   Tool arguments: {args}")
+
+                # Validate the arguments make sense for fraud
+                query = args.get("query", "").lower()
+                fraud_keywords = ["fraud", "security", "unauthorized", "suspicious"]
+                has_fraud_context = any(keyword in query for keyword in fraud_keywords)
+                print(f"   Query contains fraud context: {has_fraud_context}")
+
+        # For demo purposes, let's be flexible - either exploration or escalation is appropriate
+        has_escalation = state.has_tool_call("escalate_to_human")
+        print(f"   escalate_to_human called: {has_escalation}")
+
+        # At least one appropriate tool should be called for fraud concerns
+        appropriate_response = has_exploration or has_escalation
+        print(f"   ✅ Appropriate fraud response: {appropriate_response}")
+
+        return appropriate_response
+
+    print("🎭 Running fraud investigation demo with tool validation (OpenAI)...")
+
+    result = await scenario.run(
+        name="fraud investigation demo - OpenAI",
+        description="""
+            Customer reports suspicious transactions and potential fraud.
+            The agent should take this seriously and use appropriate tools
+            to investigate or escalate the security concern.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(criteria=[
+                "Agent takes fraud concerns seriously",
+                "Agent offers security measures or investigation",
+                "Agent maintains professional and reassuring tone"
+            ])
+        ],
+        script=[
+            scenario.user("I think someone stole my card! There are charges I didn't make - $85 at Amazon and $45 at a gas station."),
+            scenario.agent(),
+            check_fraud_tool_usage,
+            scenario.user("Yes, please help me secure my account immediately!"),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
+    if result.reasoning:
+        print(f"Reasoning: {result.reasoning}")
+
+    return result.success
+
+async def test_escalation_detection():
+    """
+    Demo test: Escalation detection with custom validation
+    """
+
+    def verify_escalation_logic(state: scenario.ScenarioState):
+        """Check that angry customers trigger escalation"""
+        print(f"\n🚨 Checking escalation logic...")
+
+        has_escalation = state.has_tool_call("escalate_to_human")
+        print(f"   escalate_to_human called: {has_escalation}")
+
+        if has_escalation:
+            tool_call = state.last_tool_call("escalate_to_human")
+            if tool_call:
+                args = json.loads(tool_call["function"]["arguments"])
+                reason = args.get("reason", "")
+                urgency = args.get("urgency", "medium")
+                print(f"   Escalation reason: {reason}")
+                print(f"   Urgency level: {urgency}")
+
+        print(f"   ✅ Escalation handled: {has_escalation}")
+        return has_escalation
+
+    print("\n🎭 Running escalation detection demo (OpenAI)...")
+
+    result = await scenario.run(
+        name="escalation detection demo - OpenAI",
+        description="""
+            Frustrated customer demands to speak with a manager.
+            Agent should recognize the escalation need and handle appropriately.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(criteria=[
+                "Agent acknowledges customer frustration",
+                "Agent offers escalation when demanded",
+                "Agent maintains professionalism"
+            ])
+        ],
+        script=[
+            scenario.user("This is ridiculous! I want to speak to your manager RIGHT NOW! Nobody can help me with this issue!"),
+            scenario.agent(),
+            verify_escalation_logic,
+            scenario.judge(),
+        ],
+    )
+
+    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
+    return result.success
+
+async def main():
+    """Run the demo tests"""
+    print("🚀 Bank Customer Support Agent - Scenario Demo (OpenAI)")
+    print("=" * 50)
+
+    # Test 1: Fraud Investigation
+    fraud_passed = await test_fraud_with_tool_validation()
+
+    # Test 2: Escalation Detection
+    escalation_passed = await test_escalation_detection()
+
+    print("\n" + "=" * 50)
+    print("📈 Demo Summary:")
+    print(f"   Fraud Investigation: {'✅ PASSED' if fraud_passed else '❌ FAILED'}")
+    print(f"   Escalation Detection: {'✅ PASSED' if escalation_passed else '❌ FAILED'}")
+
+    overall_success = fraud_passed and escalation_passed
+    print(f"\n🎯 Overall Demo: {'✅ SUCCESS' if overall_success else '❌ NEEDS WORK'}")
+
+    if overall_success:
+        print("\n🎉 Demo ready! This shows:")
+        print("   • Proper Scenario framework usage")
+        print("   • Tool calling validation")
+        print("   • Custom assertions for business logic")
+        print("   • Realistic user simulation")
+        print("   • Automated quality assessment")
+
+if __name__ == "__main__":
+    asyncio.run(main())

From f2e97c8f0a50e7ffee9fcb2e80bf91841f7061a7 Mon Sep 17 00:00:00 2001
From: aryansharma28 <aryansharma2k2@gmail.com>
Date: Wed, 11 Feb 2026 16:24:24 +0100
Subject: [PATCH 3/6] fix: tests

---
 tests-demo/test_demo_deepseek.py | 291 ++++++++++++++++++-------------
 tests-demo/test_demo_glm.py      | 281 +++++++++++++++++------------
 tests-demo/test_demo_minimax.py  | 276 +++++++++++++++++------------
 tests-demo/test_demo_openai.py   | 287 ++++++++++++++++++------------
 4 files changed, 683 insertions(+), 452 deletions(-)

diff --git a/tests-demo/test_demo_deepseek.py b/tests-demo/test_demo_deepseek.py
index 91a4060..e71ab3f 100644
--- a/tests-demo/test_demo_deepseek.py
+++ b/tests-demo/test_demo_deepseek.py
@@ -1,9 +1,11 @@
 """
-Demo test showing proper Scenario usage with tool call validation - DeepSeek Model
+Tests for the main bank customer support agent - DeepSeek Model
 
-This demonstrates the key capabilities for the customer demo using Nebius DeepSeek model.
+These tests cover real business scenarios and validate tool calling behavior
+using Nebius DeepSeek-V3.2 model for evaluation.
 """
 import asyncio
+import pytest
 import json
 import sys
 import os
@@ -18,162 +20,213 @@
 dotenv.load_dotenv()
 scenario.configure(default_model="nebius/deepseek-ai/DeepSeek-V3.2")
 
+
 class BankSupportAgentAdapter(scenario.AgentAdapter):
+    """Adapter for our main bank support agent"""
+
     async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
         message_content = input.last_new_user_message_str()
         response = support_agent.run(message_content)
-        return response.content
-
-async def test_fraud_with_tool_validation():
-    """
-    Demo test: Fraud investigation with proper tool call validation
-
-    This shows how Scenario can validate that the right tools are called
-    at the right time for specific business scenarios.
-    """
-
-    def check_fraud_tool_usage(state: scenario.ScenarioState):
-        """Custom assertion to verify fraud investigation tools were used"""
-        print(f"\n🔍 Checking tool calls after {len(state.messages)} messages...")
-
-        # Check if customer exploration was called
-        has_exploration = state.has_tool_call("explore_customer_account")
-        print(f"   explore_customer_account called: {has_exploration}")
-
-        if has_exploration:
-            tool_call = state.last_tool_call("explore_customer_account")
-            if tool_call:
-                args = json.loads(tool_call["function"]["arguments"])
-                print(f"   Tool arguments: {args}")
-
-                # Validate the arguments make sense for fraud
-                query = args.get("query", "").lower()
-                fraud_keywords = ["fraud", "security", "unauthorized", "suspicious"]
-                has_fraud_context = any(keyword in query for keyword in fraud_keywords)
-                print(f"   Query contains fraud context: {has_fraud_context}")
-
-        # For demo purposes, let's be flexible - either exploration or escalation is appropriate
-        has_escalation = state.has_tool_call("escalate_to_human")
-        print(f"   escalate_to_human called: {has_escalation}")
 
-        # At least one appropriate tool should be called for fraud concerns
-        appropriate_response = has_exploration or has_escalation
-        print(f"   ✅ Appropriate fraud response: {appropriate_response}")
-
-        return appropriate_response
-
-    print("🎭 Running fraud investigation demo with tool validation (DeepSeek)...")
+        # Convert Agno messages to OpenAI format for Scenario
+        openai_messages = []
+        for message in response.messages or []:
+            if message.role in ["assistant", "user", "system", "tool"]:
+                msg_dict = {"role": message.role, "content": message.content}
+
+                # Add tool calls if present (for assistant messages)
+                if message.tool_calls:
+                    msg_dict["tool_calls"] = message.tool_calls
+
+                # Add tool call ID if present (for tool messages)
+                if hasattr(message, "tool_call_id") and message.tool_call_id:
+                    msg_dict["tool_call_id"] = message.tool_call_id
+
+                openai_messages.append(msg_dict)
+
+        # Return all messages except system and user (Scenario manages the conversation flow)
+        # We need to include tool messages to satisfy OpenAI's requirements
+        relevant_messages = [
+            msg for msg in openai_messages if msg["role"] in ["assistant", "tool"]
+        ]
+
+        if relevant_messages:
+            return relevant_messages
+
+        # Fallback to content if no relevant messages found
+        return response.content  # type: ignore
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_fraud_investigation_workflow():
+    # Custom assertions for tool calling validation
+    def check_customer_exploration_called(state: scenario.ScenarioState):
+        """Verify the agent called explore_customer_account for fraud investigation"""
+        assert state.has_tool_call(
+            "explore_customer_account"
+        ), "Agent should call explore_customer_account for fraud concerns"
+
+        # Check the tool was called with appropriate parameters
+        tool_call = state.last_tool_call("explore_customer_account")
+        if tool_call:
+            args = json.loads(tool_call["function"]["arguments"])
+            assert "customer_id" in args, "Tool call should include customer_id"
+
+    def verify_no_inappropriate_tools(state: scenario.ScenarioState):
+        """Ensure agent doesn't use inappropriate tools for fraud scenarios"""
+        # Should not use message suggestions for clear security issues
+        assert not state.has_tool_call(
+            "get_message_suggestion"
+        ), "Agent should not need message suggestions for clear fraud cases"
 
     result = await scenario.run(
-        name="fraud investigation demo - DeepSeek",
+        name="fraud investigation and card security - DeepSeek",
         description="""
-            Customer reports suspicious transactions and potential fraud.
-            The agent should take this seriously and use appropriate tools
-            to investigate or escalate the security concern.
+            Customer discovers unauthorized transactions on their account and is worried about fraud.
+            They need immediate help to secure their account and investigate the suspicious activity.
+            The agent should use customer exploration tools to analyze the account.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
-            scenario.JudgeAgent(criteria=[
-                "Agent takes fraud concerns seriously",
-                "Agent offers security measures or investigation",
-                "Agent maintains professional and reassuring tone"
-            ])
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent takes fraud concerns seriously and responds with urgency",
+                    "Agent offers concrete security actions like card freezing",
+                    "Agent provides clear next steps for fraud investigation",
+                    "Agent maintains professional and reassuring tone",
+                ]
+            ),
         ],
         script=[
-            scenario.user("I think someone stole my card! There are charges I didn't make - $85 at Amazon and $45 at a gas station."),
+            scenario.user(
+                "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
+            ),
             scenario.agent(),
-            check_fraud_tool_usage,
-            scenario.user("Yes, please help me secure my account immediately!"),
+            check_customer_exploration_called,
+            scenario.user(
+                "There's an $85 charge at Amazon and a $45 charge at some gas station. I definitely didn't make these purchases."
+            ),
             scenario.agent(),
+            scenario.user(
+                "Yes, please help me secure my account right away. I'm worried about more charges."
+            ),
+            scenario.agent(),
+            verify_no_inappropriate_tools,
             scenario.judge(),
         ],
     )
 
-    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
-    if result.reasoning:
-        print(f"Reasoning: {result.reasoning}")
-
-    return result.success
-
-async def test_escalation_detection():
-    """
-    Demo test: Escalation detection with custom validation
-    """
-
-    def verify_escalation_logic(state: scenario.ScenarioState):
-        """Check that angry customers trigger escalation"""
-        print(f"\n🚨 Checking escalation logic...")
-
-        has_escalation = state.has_tool_call("escalate_to_human")
-        print(f"   escalate_to_human called: {has_escalation}")
-
-        if has_escalation:
-            tool_call = state.last_tool_call("escalate_to_human")
-            if tool_call:
-                args = json.loads(tool_call["function"]["arguments"])
-                reason = args.get("reason", "")
-                urgency = args.get("urgency", "medium")
-                print(f"   Escalation reason: {reason}")
-                print(f"   Urgency level: {urgency}")
-
-        print(f"   ✅ Escalation handled: {has_escalation}")
-        return has_escalation
-
-    print("\n🎭 Running escalation detection demo (DeepSeek)...")
+    assert result.success, f"Fraud investigation test failed: {result.reasoning}"
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_escalation_workflow():
+    def check_escalation_called(state: scenario.ScenarioState):
+        """Verify agent escalates when customer explicitly demands human help"""
+        assert state.has_tool_call(
+            "escalate_to_human"
+        ), "Agent should escalate when customer demands manager/human help"
+
+        tool_call = state.last_tool_call("escalate_to_human")
+        if tool_call:
+            args = json.loads(tool_call["function"]["arguments"])
+            reason = args.get("reason", "").lower()
+            assert any(
+                keyword in reason
+                for keyword in ["frustrated", "manager", "human", "escalation"]
+            ), "Escalation reason should reflect customer's frustration and demand"
 
     result = await scenario.run(
-        name="escalation detection demo - DeepSeek",
+        name="customer escalation to human agent - DeepSeek",
         description="""
-            Frustrated customer demands to speak with a manager.
-            Agent should recognize the escalation need and handle appropriately.
+            Customer has been dealing with an ongoing issue and is frustrated.
+            They explicitly demand to speak with a human agent or manager.
+            The agent should handle this professionally and escalate appropriately.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
-            scenario.JudgeAgent(criteria=[
-                "Agent acknowledges customer frustration",
-                "Agent offers escalation when demanded",
-                "Agent maintains professionalism"
-            ])
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent acknowledges customer's frustration empathetically",
+                    "Agent offers to escalate when requested",
+                    "Agent provides escalation timeline and process information",
+                    "Agent maintains professionalism despite customer frustration",
+                ]
+            ),
         ],
         script=[
-            scenario.user("This is ridiculous! I want to speak to your manager RIGHT NOW! Nobody can help me with this issue!"),
+            scenario.user(
+                "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!"
+            ),
             scenario.agent(),
-            verify_escalation_logic,
+            scenario.user(
+                "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service."
+            ),
+            scenario.agent(),
+            check_escalation_called,
             scenario.judge(),
         ],
     )
 
-    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
-    return result.success
+    assert result.success, f"Escalation test failed: {result.reasoning}"
 
-async def main():
-    """Run the demo tests"""
-    print("🚀 Bank Customer Support Agent - Scenario Demo (DeepSeek)")
-    print("=" * 50)
 
-    # Test 1: Fraud Investigation
-    fraud_passed = await test_fraud_with_tool_validation()
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_complex_issue_triggers_knowledge_base():
+    def check_message_suggestion_called(state: scenario.ScenarioState):
+        """Verify agent uses knowledge base for complex multi-part issues"""
+        assert state.has_tool_call(
+            "get_message_suggestion"
+        ), "Agent should use message suggestions for complex banking issues"
 
-    # Test 2: Escalation Detection
-    escalation_passed = await test_escalation_detection()
+        tool_call = state.last_tool_call("get_message_suggestion")
+        if tool_call:
+            args = json.loads(tool_call["function"]["arguments"])
+            query = args.get("customer_query", "").lower()
+            assert any(
+                keyword in query for keyword in ["lock", "fee", "deposit", "multiple"]
+            ), "Tool call should reference the customer's specific issues"
 
-    print("\n" + "=" * 50)
-    print("📈 Demo Summary:")
-    print(f"   Fraud Investigation: {'✅ PASSED' if fraud_passed else '❌ FAILED'}")
-    print(f"   Escalation Detection: {'✅ PASSED' if escalation_passed else '❌ FAILED'}")
+    result = await scenario.run(
+        name="complex multi-issue banking problem - DeepSeek",
+        description="""
+            Customer has multiple interconnected banking problems: locked online banking,
+            unexpected fees, and missing direct deposit. They need systematic help
+            and the agent should use knowledge base guidance.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent addresses all parts of the multi-faceted problem",
+                    "Agent provides systematic approach to resolving issues",
+                    "Agent shows empathy for customer frustration",
+                    "Agent offers clear next steps for each problem",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit."
+            ),
+            scenario.agent(),
+            check_message_suggestion_called,
+            scenario.user(
+                "I've tried resetting my password multiple times and I really need access to pay my bills. This is really stressing me out."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
 
-    overall_success = fraud_passed and escalation_passed
-    print(f"\n🎯 Overall Demo: {'✅ SUCCESS' if overall_success else '❌ NEEDS WORK'}")
+    assert (
+        result.success
+    ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
 
-    if overall_success:
-        print("\n🎉 Demo ready! This shows:")
-        print("   • Proper Scenario framework usage")
-        print("   • Tool calling validation")
-        print("   • Custom assertions for business logic")
-        print("   • Realistic user simulation")
-        print("   • Automated quality assessment")
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(test_fraud_investigation_workflow())
diff --git a/tests-demo/test_demo_glm.py b/tests-demo/test_demo_glm.py
index cc267c6..0c2d863 100644
--- a/tests-demo/test_demo_glm.py
+++ b/tests-demo/test_demo_glm.py
@@ -1,9 +1,11 @@
 """
-Demo test showing proper Scenario usage with tool call validation - GLM Model
+Tests for the main bank customer support agent - GLM Model
 
-This demonstrates the key capabilities for the customer demo using Nebius GLM model.
+These tests cover real business scenarios and validate tool calling behavior
+using Nebius GLM-4.7-FP8 model for evaluation.
 """
 import asyncio
+import pytest
 import json
 import sys
 import os
@@ -18,162 +20,221 @@
 dotenv.load_dotenv()
 scenario.configure(default_model="nebius/zai-org/GLM-4.7-FP8")
 
+
 class BankSupportAgentAdapter(scenario.AgentAdapter):
+    """Adapter for our main bank support agent"""
+
     async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
         message_content = input.last_new_user_message_str()
         response = support_agent.run(message_content)
-        return response.content
-
-async def test_fraud_with_tool_validation():
-    """
-    Demo test: Fraud investigation with proper tool call validation
-
-    This shows how Scenario can validate that the right tools are called
-    at the right time for specific business scenarios.
-    """
-
-    def check_fraud_tool_usage(state: scenario.ScenarioState):
-        """Custom assertion to verify fraud investigation tools were used"""
-        print(f"\n🔍 Checking tool calls after {len(state.messages)} messages...")
 
-        # Check if customer exploration was called
-        has_exploration = state.has_tool_call("explore_customer_account")
-        print(f"   explore_customer_account called: {has_exploration}")
-
-        if has_exploration:
-            tool_call = state.last_tool_call("explore_customer_account")
-            if tool_call:
-                args = json.loads(tool_call["function"]["arguments"])
-                print(f"   Tool arguments: {args}")
+        # Convert Agno messages to OpenAI format for Scenario
+        openai_messages = []
+        for message in response.messages or []:
+            if message.role in ["assistant", "user", "system", "tool"]:
+                msg_dict = {"role": message.role, "content": message.content}
+
+                # Add tool calls if present (for assistant messages)
+                if message.tool_calls:
+                    msg_dict["tool_calls"] = message.tool_calls
+
+                # Add tool call ID if present (for tool messages)
+                if hasattr(message, "tool_call_id") and message.tool_call_id:
+                    msg_dict["tool_call_id"] = message.tool_call_id
+
+                openai_messages.append(msg_dict)
+
+        # Return all messages except system and user (Scenario manages the conversation flow)
+        # We need to include tool messages to satisfy OpenAI's requirements
+        relevant_messages = [
+            msg for msg in openai_messages if msg["role"] in ["assistant", "tool"]
+        ]
+
+        if relevant_messages:
+            return relevant_messages
+
+        # Fallback to content if no relevant messages found
+        return response.content  # type: ignore
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_fraud_investigation_workflow():
+    # Custom assertions for tool calling validation
+    def check_customer_exploration_called(state: scenario.ScenarioState):
+        """Verify the agent called explore_customer_account for fraud investigation"""
+        assert state.has_tool_call(
+            "explore_customer_account"
+        ), "Agent should call explore_customer_account for fraud concerns"
+
+        # Check the tool was called with appropriate parameters
+        tool_call = state.last_tool_call("explore_customer_account")
+        if tool_call:
+            args = json.loads(tool_call["function"]["arguments"])
+            assert "customer_id" in args, "Tool call should include customer_id"
+
+    def verify_no_inappropriate_tools(state: scenario.ScenarioState):
+        """Ensure agent doesn't use inappropriate tools for fraud scenarios"""
+        # Should not use message suggestions for clear security issues
+        assert not state.has_tool_call(
+            "get_message_suggestion"
+        ), "Agent should not need message suggestions for clear fraud cases"
 
-                # Validate the arguments make sense for fraud
-                query = args.get("query", "").lower()
-                fraud_keywords = ["fraud", "security", "unauthorized", "suspicious"]
-                has_fraud_context = any(keyword in query for keyword in fraud_keywords)
-                print(f"   Query contains fraud context: {has_fraud_context}")
+    result = await scenario.run(
+        name="fraud investigation and card security - GLM",
+        description="""
+            Customer discovers unauthorized transactions on their account and is worried about fraud.
+            They need immediate help to secure their account and investigate the suspicious activity.
+            The agent should use customer exploration tools to analyze the account.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent takes fraud concerns seriously and responds with urgency",
+                    "Agent offers concrete security actions like card freezing",
+                    "Agent provides clear next steps for fraud investigation",
+                    "Agent maintains professional and reassuring tone",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
+            ),
+            scenario.agent(),
+            check_customer_exploration_called,
+            scenario.user(
+                "There's an $85 charge at Amazon and a $45 charge at some gas station. I definitely didn't make these purchases."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "Yes, please help me secure my account right away. I'm worried about more charges."
+            ),
+            scenario.agent(),
+            verify_no_inappropriate_tools,
+            scenario.judge(),
+        ],
+    )
 
-        # For demo purposes, let's be flexible - either exploration or escalation is appropriate
-        has_escalation = state.has_tool_call("escalate_to_human")
-        print(f"   escalate_to_human called: {has_escalation}")
+    assert result.success, f"Fraud investigation test failed: {result.reasoning}"
 
-        # At least one appropriate tool should be called for fraud concerns
-        appropriate_response = has_exploration or has_escalation
-        print(f"   ✅ Appropriate fraud response: {appropriate_response}")
 
-        return appropriate_response
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_escalation_workflow():
+    def check_escalation_called(state: scenario.ScenarioState):
+        """Verify agent escalates when customer explicitly demands human help"""
+        assert state.has_tool_call(
+            "escalate_to_human"
+        ), "Agent should escalate when customer demands manager/human help"
 
-    print("🎭 Running fraud investigation demo with tool validation (GLM)...")
+        tool_call = state.last_tool_call("escalate_to_human")
+        if tool_call:
+            args = json.loads(tool_call["function"]["arguments"])
+            reason = args.get("reason", "").lower()
+            assert any(
+                keyword in reason
+                for keyword in ["frustrated", "manager", "human", "escalation"]
+            ), "Escalation reason should reflect customer's frustration and demand"
 
     result = await scenario.run(
-        name="fraud investigation demo - GLM",
+        name="customer escalation to human agent - GLM",
         description="""
-            Customer reports suspicious transactions and potential fraud.
-            The agent should take this seriously and use appropriate tools
-            to investigate or escalate the security concern.
+            Customer has been dealing with an ongoing issue and is frustrated.
+            They explicitly demand to speak with a human agent or manager.
+            The agent should handle this professionally and escalate appropriately.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
-            scenario.JudgeAgent(criteria=[
-                "Agent takes fraud concerns seriously",
-                "Agent offers security measures or investigation",
-                "Agent maintains professional and reassuring tone"
-            ])
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent acknowledges customer's frustration empathetically",
+                    "Agent offers to escalate when requested",
+                    "Agent provides escalation timeline and process information",
+                    "Agent maintains professionalism despite customer frustration",
+                ]
+            ),
         ],
         script=[
-            scenario.user("I think someone stole my card! There are charges I didn't make - $85 at Amazon and $45 at a gas station."),
+            scenario.user(
+                "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!"
+            ),
             scenario.agent(),
-            check_fraud_tool_usage,
-            scenario.user("Yes, please help me secure my account immediately!"),
+            scenario.user(
+                "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service."
+            ),
             scenario.agent(),
+            check_escalation_called,
             scenario.judge(),
         ],
     )
 
-    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
-    if result.reasoning:
-        print(f"Reasoning: {result.reasoning}")
-
-    return result.success
+    assert result.success, f"Escalation test failed: {result.failure_reason}"  # type: ignore
 
-async def test_escalation_detection():
-    """
-    Demo test: Escalation detection with custom validation
-    """
-
-    def verify_escalation_logic(state: scenario.ScenarioState):
-        """Check that angry customers trigger escalation"""
-        print(f"\n🚨 Checking escalation logic...")
 
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_urgent_business_scenario():
+    def check_appropriate_urgency_response(state: scenario.ScenarioState):
+        """Verify agent responds appropriately to business urgency"""
+        # For urgent business issues, agent should either:
+        # 1. Escalate immediately, OR
+        # 2. Use customer exploration to provide immediate solutions
         has_escalation = state.has_tool_call("escalate_to_human")
-        print(f"   escalate_to_human called: {has_escalation}")
+        has_exploration = state.has_tool_call("explore_customer_account")
 
+        assert (
+            has_escalation or has_exploration
+        ), "Agent should either escalate urgent business issues or explore customer account for immediate solutions"
+
+        # Check that urgency is reflected in tool call parameters
         if has_escalation:
             tool_call = state.last_tool_call("escalate_to_human")
             if tool_call:
                 args = json.loads(tool_call["function"]["arguments"])
-                reason = args.get("reason", "")
                 urgency = args.get("urgency", "medium")
-                print(f"   Escalation reason: {reason}")
-                print(f"   Urgency level: {urgency}")
-
-        print(f"   ✅ Escalation handled: {has_escalation}")
-        return has_escalation
-
-    print("\n🎭 Running escalation detection demo (GLM)...")
+                assert (
+                    urgency == "high"
+                ), "Business urgency should be marked as high priority"
 
     result = await scenario.run(
-        name="escalation detection demo - GLM",
+        name="urgent business account problem - GLM",
         description="""
-            Frustrated customer demands to speak with a manager.
-            Agent should recognize the escalation need and handle appropriately.
+            Business customer has an urgent issue affecting their operations.
+            They can't access funds to pay employees. This requires immediate
+            attention and appropriate priority handling.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
-            scenario.JudgeAgent(criteria=[
-                "Agent acknowledges customer frustration",
-                "Agent offers escalation when demanded",
-                "Agent maintains professionalism"
-            ])
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent recognizes the business urgency and impact",
+                    "Agent treats the issue with appropriate priority",
+                    "Agent offers immediate assistance or escalation",
+                    "Agent provides clear timeline for resolution",
+                ]
+            ),
         ],
         script=[
-            scenario.user("This is ridiculous! I want to speak to your manager RIGHT NOW! Nobody can help me with this issue!"),
+            scenario.user(
+                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute!"
+            ),
+            scenario.agent(),
+            check_appropriate_urgency_response,
+            scenario.user(
+                "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?"
+            ),
             scenario.agent(),
-            verify_escalation_logic,
             scenario.judge(),
         ],
     )
 
-    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
-    return result.success
-
-async def main():
-    """Run the demo tests"""
-    print("🚀 Bank Customer Support Agent - Scenario Demo (GLM)")
-    print("=" * 50)
-
-    # Test 1: Fraud Investigation
-    fraud_passed = await test_fraud_with_tool_validation()
-
-    # Test 2: Escalation Detection
-    escalation_passed = await test_escalation_detection()
-
-    print("\n" + "=" * 50)
-    print("📈 Demo Summary:")
-    print(f"   Fraud Investigation: {'✅ PASSED' if fraud_passed else '❌ FAILED'}")
-    print(f"   Escalation Detection: {'✅ PASSED' if escalation_passed else '❌ FAILED'}")
-
-    overall_success = fraud_passed and escalation_passed
-    print(f"\n🎯 Overall Demo: {'✅ SUCCESS' if overall_success else '❌ NEEDS WORK'}")
+    assert result.success, f"Urgent business test failed: {result.failure_reason}"  # type: ignore
 
-    if overall_success:
-        print("\n🎉 Demo ready! This shows:")
-        print("   • Proper Scenario framework usage")
-        print("   • Tool calling validation")
-        print("   • Custom assertions for business logic")
-        print("   • Realistic user simulation")
-        print("   • Automated quality assessment")
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(test_fraud_investigation_workflow())
diff --git a/tests-demo/test_demo_minimax.py b/tests-demo/test_demo_minimax.py
index 40972ea..3eebc2b 100644
--- a/tests-demo/test_demo_minimax.py
+++ b/tests-demo/test_demo_minimax.py
@@ -1,9 +1,11 @@
 """
-Demo test showing proper Scenario usage with tool call validation - MiniMax Model
+Tests for the main bank customer support agent - MiniMax Model
 
-This demonstrates the key capabilities for the customer demo using Nebius MiniMax model.
+These tests cover real business scenarios and validate tool calling behavior
+using Nebius MiniMax-M2.1 model for evaluation.
 """
 import asyncio
+import pytest
 import json
 import sys
 import os
@@ -18,162 +20,222 @@
 dotenv.load_dotenv()
 scenario.configure(default_model="nebius/MiniMaxAI/MiniMax-M2.1")
 
+
 class BankSupportAgentAdapter(scenario.AgentAdapter):
+    """Adapter for our main bank support agent"""
+
     async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
         message_content = input.last_new_user_message_str()
         response = support_agent.run(message_content)
-        return response.content
 
-async def test_fraud_with_tool_validation():
-    """
-    Demo test: Fraud investigation with proper tool call validation
+        # Convert Agno messages to OpenAI format for Scenario
+        openai_messages = []
+        for message in response.messages or []:
+            if message.role in ["assistant", "user", "system", "tool"]:
+                msg_dict = {"role": message.role, "content": message.content}
+
+                # Add tool calls if present (for assistant messages)
+                if message.tool_calls:
+                    msg_dict["tool_calls"] = message.tool_calls
+
+                # Add tool call ID if present (for tool messages)
+                if hasattr(message, "tool_call_id") and message.tool_call_id:
+                    msg_dict["tool_call_id"] = message.tool_call_id
+
+                openai_messages.append(msg_dict)
+
+        # Return all messages except system and user (Scenario manages the conversation flow)
+        # We need to include tool messages to satisfy OpenAI's requirements
+        relevant_messages = [
+            msg for msg in openai_messages if msg["role"] in ["assistant", "tool"]
+        ]
+
+        if relevant_messages:
+            return relevant_messages
 
-    This shows how Scenario can validate that the right tools are called
-    at the right time for specific business scenarios.
-    """
+        # Fallback to content if no relevant messages found
+        return response.content  # type: ignore
 
-    def check_fraud_tool_usage(state: scenario.ScenarioState):
-        """Custom assertion to verify fraud investigation tools were used"""
-        print(f"\n🔍 Checking tool calls after {len(state.messages)} messages...")
 
-        # Check if customer exploration was called
-        has_exploration = state.has_tool_call("explore_customer_account")
-        print(f"   explore_customer_account called: {has_exploration}")
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_comprehensive_tool_coordination():
+    """Test a scenario that uses multiple tools in sequence"""
 
-        if has_exploration:
-            tool_call = state.last_tool_call("explore_customer_account")
-            if tool_call:
-                args = json.loads(tool_call["function"]["arguments"])
-                print(f"   Tool arguments: {args}")
+    # Track which tools were called
+    tools_called = []
 
-                # Validate the arguments make sense for fraud
-                query = args.get("query", "").lower()
-                fraud_keywords = ["fraud", "security", "unauthorized", "suspicious"]
-                has_fraud_context = any(keyword in query for keyword in fraud_keywords)
-                print(f"   Query contains fraud context: {has_fraud_context}")
+    def track_customer_exploration(state: scenario.ScenarioState):
+        if state.has_tool_call("explore_customer_account"):
+            tools_called.append("explore_customer_account")
 
-        # For demo purposes, let's be flexible - either exploration or escalation is appropriate
-        has_escalation = state.has_tool_call("escalate_to_human")
-        print(f"   escalate_to_human called: {has_escalation}")
+    def track_message_suggestion(state: scenario.ScenarioState):
+        if state.has_tool_call("get_message_suggestion"):
+            tools_called.append("get_message_suggestion")
 
-        # At least one appropriate tool should be called for fraud concerns
-        appropriate_response = has_exploration or has_escalation
-        print(f"   ✅ Appropriate fraud response: {appropriate_response}")
+    def track_conversation_summary(state: scenario.ScenarioState):
+        if state.has_tool_call("get_conversation_summary"):
+            tools_called.append("get_conversation_summary")
 
-        return appropriate_response
+    def validate_tool_coordination(state: scenario.ScenarioState):
+        """Ensure agent used appropriate tools throughout the conversation"""
+        # Should have used customer exploration for account analysis
+        assert (
+            "explore_customer_account" in tools_called
+        ), "Agent should explore customer account for spending analysis"
 
-    print("🎭 Running fraud investigation demo with tool validation (MiniMax)...")
+        # Verify conversation depth: at least three user and three assistant messages
+        user_messages = [m for m in state.messages if m["role"] == "user"]
+        agent_messages = [m for m in state.messages if m["role"] == "assistant"]
+        assert len(user_messages) >= 3, "Conversation should have multiple user turns"
+        assert len(agent_messages) >= 3, "Agent should respond multiple times"
 
     result = await scenario.run(
-        name="fraud investigation demo - MiniMax",
+        name="comprehensive account analysis and advice - MiniMax",
         description="""
-            Customer reports suspicious transactions and potential fraud.
-            The agent should take this seriously and use appropriate tools
-            to investigate or escalate the security concern.
+            Customer wants to understand their spending patterns and get financial advice.
+            This requires account exploration, potentially knowledge base guidance,
+            and possibly conversation analysis. The agent should coordinate multiple tools effectively.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
-            scenario.JudgeAgent(criteria=[
-                "Agent takes fraud concerns seriously",
-                "Agent offers security measures or investigation",
-                "Agent maintains professional and reassuring tone"
-            ])
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent provides personalized insights based on account data",
+                    "Agent offers actionable financial recommendations",
+                    "Agent asks relevant follow-up questions",
+                    "Agent coordinates multiple information sources effectively",
+                ]
+            ),
         ],
         script=[
-            scenario.user("I think someone stole my card! There are charges I didn't make - $85 at Amazon and $45 at a gas station."),
+            scenario.user(
+                "I want to get better at managing my money. Can you analyze my spending and help me understand where I can improve?"
+            ),
             scenario.agent(),
-            check_fraud_tool_usage,
-            scenario.user("Yes, please help me secure my account immediately!"),
+            track_customer_exploration,
+            scenario.user(
+                "That's helpful! Can you also suggest a realistic budget based on my spending patterns and give me specific advice?"
+            ),
             scenario.agent(),
+            track_message_suggestion,
+            scenario.user(
+                "This conversation has been really valuable. Can you summarize the key insights and recommendations we discussed?"
+            ),
+            scenario.agent(),
+            track_conversation_summary,
+            validate_tool_coordination,
             scenario.judge(),
         ],
     )
 
-    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
-    if result.reasoning:
-        print(f"Reasoning: {result.reasoning}")
-
-    return result.success
-
-async def test_escalation_detection():
-    """
-    Demo test: Escalation detection with custom validation
-    """
-
-    def verify_escalation_logic(state: scenario.ScenarioState):
-        """Check that angry customers trigger escalation"""
-        print(f"\n🚨 Checking escalation logic...")
+    assert result.success, f"Tool coordination test failed: {result.failure_reason}"  # type: ignore
 
-        has_escalation = state.has_tool_call("escalate_to_human")
-        print(f"   escalate_to_human called: {has_escalation}")
 
-        if has_escalation:
-            tool_call = state.last_tool_call("escalate_to_human")
-            if tool_call:
-                args = json.loads(tool_call["function"]["arguments"])
-                reason = args.get("reason", "")
-                urgency = args.get("urgency", "medium")
-                print(f"   Escalation reason: {reason}")
-                print(f"   Urgency level: {urgency}")
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_escalation_workflow():
+    def check_escalation_called(state: scenario.ScenarioState):
+        """Verify agent escalates when customer explicitly demands human help"""
+        assert state.has_tool_call(
+            "escalate_to_human"
+        ), "Agent should escalate when customer demands manager/human help"
 
-        print(f"   ✅ Escalation handled: {has_escalation}")
-        return has_escalation
-
-    print("\n🎭 Running escalation detection demo (MiniMax)...")
+        tool_call = state.last_tool_call("escalate_to_human")
+        if tool_call:
+            args = json.loads(tool_call["function"]["arguments"])
+            reason = args.get("reason", "").lower()
+            assert any(
+                keyword in reason
+                for keyword in ["frustrated", "manager", "human", "escalation"]
+            ), "Escalation reason should reflect customer's frustration and demand"
 
     result = await scenario.run(
-        name="escalation detection demo - MiniMax",
+        name="customer escalation to human agent - MiniMax",
         description="""
-            Frustrated customer demands to speak with a manager.
-            Agent should recognize the escalation need and handle appropriately.
+            Customer has been dealing with an ongoing issue and is frustrated.
+            They explicitly demand to speak with a human agent or manager.
+            The agent should handle this professionally and escalate appropriately.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
-            scenario.JudgeAgent(criteria=[
-                "Agent acknowledges customer frustration",
-                "Agent offers escalation when demanded",
-                "Agent maintains professionalism"
-            ])
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent acknowledges customer's frustration empathetically",
+                    "Agent offers to escalate when requested",
+                    "Agent provides escalation timeline and process information",
+                    "Agent maintains professionalism despite customer frustration",
+                ]
+            ),
         ],
         script=[
-            scenario.user("This is ridiculous! I want to speak to your manager RIGHT NOW! Nobody can help me with this issue!"),
+            scenario.user(
+                "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service."
+            ),
             scenario.agent(),
-            verify_escalation_logic,
+            check_escalation_called,
             scenario.judge(),
         ],
     )
 
-    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
-    return result.success
+    assert result.success, f"Escalation test failed: {result.failure_reason}"  # type: ignore
+
 
-async def main():
-    """Run the demo tests"""
-    print("🚀 Bank Customer Support Agent - Scenario Demo (MiniMax)")
-    print("=" * 50)
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_tool_precision_simple_query():
+    """Test that agent doesn't over-use tools for simple queries"""
 
-    # Test 1: Fraud Investigation
-    fraud_passed = await test_fraud_with_tool_validation()
+    def verify_minimal_tool_usage(state: scenario.ScenarioState):
+        """Ensure agent doesn't call unnecessary tools for simple questions"""
+        # Count total tool calls
+        tool_calls = 0
+        for message in state.messages:
+            if message["role"] == "assistant" and "tool_calls" in message:
+                tool_calls += len(message["tool_calls"])  # type: ignore
 
-    # Test 2: Escalation Detection
-    escalation_passed = await test_escalation_detection()
+        # For a simple service-hours question, the agent should make at most one tool call
+        assert (
+            tool_calls <= 1
+        ), f"Agent should use minimal tools for simple queries, but used {tool_calls} tool calls"
 
-    print("\n" + "=" * 50)
-    print("📈 Demo Summary:")
-    print(f"   Fraud Investigation: {'✅ PASSED' if fraud_passed else '❌ FAILED'}")
-    print(f"   Escalation Detection: {'✅ PASSED' if escalation_passed else '❌ FAILED'}")
+    result = await scenario.run(
+        name="simple service hours inquiry - MiniMax",
+        description="""
+            Customer asks a simple question about service hours.
+            This should not require complex tool usage or analysis.
+            Agent should respond directly and efficiently.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent responds directly to simple questions",
+                    "Agent provides clear and helpful information",
+                    "Agent doesn't over-complicate simple interactions",
+                    "Agent maintains friendly and professional tone",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user("What are your customer service hours?"),
+            scenario.agent(),
+            verify_minimal_tool_usage,
+            scenario.user("Thank you, that's helpful."),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
 
-    overall_success = fraud_passed and escalation_passed
-    print(f"\n🎯 Overall Demo: {'✅ SUCCESS' if overall_success else '❌ NEEDS WORK'}")
+    assert result.success, f"Tool precision test failed: {result.failure_reason}"  # type: ignore
 
-    if overall_success:
-        print("\n🎉 Demo ready! This shows:")
-        print("   • Proper Scenario framework usage")
-        print("   • Tool calling validation")
-        print("   • Custom assertions for business logic")
-        print("   • Realistic user simulation")
-        print("   • Automated quality assessment")
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(test_comprehensive_tool_coordination())
diff --git a/tests-demo/test_demo_openai.py b/tests-demo/test_demo_openai.py
index 5f85012..27cbac5 100644
--- a/tests-demo/test_demo_openai.py
+++ b/tests-demo/test_demo_openai.py
@@ -1,9 +1,11 @@
 """
-Demo test showing proper Scenario usage with tool call validation - OpenAI Model
+Tests for the main bank customer support agent - OpenAI Model
 
-This demonstrates the key capabilities for the customer demo using OpenAI models.
+These tests cover real business scenarios and validate tool calling behavior
+using OpenAI gpt-4o-mini model for evaluation.
 """
 import asyncio
+import pytest
 import json
 import sys
 import os
@@ -18,162 +20,215 @@
 dotenv.load_dotenv()
 scenario.configure(default_model="openai/gpt-4o-mini")
 
+
 class BankSupportAgentAdapter(scenario.AgentAdapter):
+    """Adapter for our main bank support agent"""
+
     async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
         message_content = input.last_new_user_message_str()
         response = support_agent.run(message_content)
-        return response.content
-
-async def test_fraud_with_tool_validation():
-    """
-    Demo test: Fraud investigation with proper tool call validation
-
-    This shows how Scenario can validate that the right tools are called
-    at the right time for specific business scenarios.
-    """
-
-    def check_fraud_tool_usage(state: scenario.ScenarioState):
-        """Custom assertion to verify fraud investigation tools were used"""
-        print(f"\n🔍 Checking tool calls after {len(state.messages)} messages...")
-
-        # Check if customer exploration was called
-        has_exploration = state.has_tool_call("explore_customer_account")
-        print(f"   explore_customer_account called: {has_exploration}")
-
-        if has_exploration:
-            tool_call = state.last_tool_call("explore_customer_account")
-            if tool_call:
-                args = json.loads(tool_call["function"]["arguments"])
-                print(f"   Tool arguments: {args}")
-
-                # Validate the arguments make sense for fraud
-                query = args.get("query", "").lower()
-                fraud_keywords = ["fraud", "security", "unauthorized", "suspicious"]
-                has_fraud_context = any(keyword in query for keyword in fraud_keywords)
-                print(f"   Query contains fraud context: {has_fraud_context}")
 
-        # For demo purposes, let's be flexible - either exploration or escalation is appropriate
-        has_escalation = state.has_tool_call("escalate_to_human")
-        print(f"   escalate_to_human called: {has_escalation}")
-
-        # At least one appropriate tool should be called for fraud concerns
-        appropriate_response = has_exploration or has_escalation
-        print(f"   ✅ Appropriate fraud response: {appropriate_response}")
-
-        return appropriate_response
-
-    print("🎭 Running fraud investigation demo with tool validation (OpenAI)...")
+        # Convert Agno messages to OpenAI format for Scenario
+        openai_messages = []
+        for message in response.messages or []:
+            if message.role in ["assistant", "user", "system", "tool"]:
+                msg_dict = {"role": message.role, "content": message.content}
+
+                # Add tool calls if present (for assistant messages)
+                if message.tool_calls:
+                    msg_dict["tool_calls"] = message.tool_calls
+
+                # Add tool call ID if present (for tool messages)
+                if hasattr(message, "tool_call_id") and message.tool_call_id:
+                    msg_dict["tool_call_id"] = message.tool_call_id
+
+                openai_messages.append(msg_dict)
+
+        # Return all messages except system and user (Scenario manages the conversation flow)
+        # We need to include tool messages to satisfy OpenAI's requirements
+        relevant_messages = [
+            msg for msg in openai_messages if msg["role"] in ["assistant", "tool"]
+        ]
+
+        if relevant_messages:
+            return relevant_messages
+
+        # Fallback to content if no relevant messages found
+        return response.content  # type: ignore
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_fraud_investigation_workflow():
+    # Custom assertions for tool calling validation
+    def check_customer_exploration_called(state: scenario.ScenarioState):
+        """Verify the agent called explore_customer_account for fraud investigation"""
+        assert state.has_tool_call(
+            "explore_customer_account"
+        ), "Agent should call explore_customer_account for fraud concerns"
+
+        # Check the tool was called with appropriate parameters
+        tool_call = state.last_tool_call("explore_customer_account")
+        if tool_call:
+            args = json.loads(tool_call["function"]["arguments"])
+            assert "customer_id" in args, "Tool call should include customer_id"
+
+    def verify_no_inappropriate_tools(state: scenario.ScenarioState):
+        """Ensure agent doesn't use inappropriate tools for fraud scenarios"""
+        # Should not use message suggestions for clear security issues
+        assert not state.has_tool_call(
+            "get_message_suggestion"
+        ), "Agent should not need message suggestions for clear fraud cases"
 
     result = await scenario.run(
-        name="fraud investigation demo - OpenAI",
+        name="fraud investigation and card security - OpenAI",
         description="""
-            Customer reports suspicious transactions and potential fraud.
-            The agent should take this seriously and use appropriate tools
-            to investigate or escalate the security concern.
+            Customer discovers unauthorized transactions on their account and is worried about fraud.
+            They need immediate help to secure their account and investigate the suspicious activity.
+            The agent should use customer exploration tools to analyze the account.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
-            scenario.JudgeAgent(criteria=[
-                "Agent takes fraud concerns seriously",
-                "Agent offers security measures or investigation",
-                "Agent maintains professional and reassuring tone"
-            ])
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent takes fraud concerns seriously and responds with urgency",
+                    "Agent offers concrete security actions like card freezing",
+                    "Agent provides clear next steps for fraud investigation",
+                    "Agent maintains professional and reassuring tone",
+                ]
+            ),
         ],
         script=[
-            scenario.user("I think someone stole my card! There are charges I didn't make - $85 at Amazon and $45 at a gas station."),
+            scenario.user(
+                "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
+            ),
+            scenario.agent(),
+            check_customer_exploration_called,
+            scenario.user(
+                "There's an $85 charge at Amazon and a $45 charge at some gas station. I definitely didn't make these purchases."
+            ),
             scenario.agent(),
-            check_fraud_tool_usage,
-            scenario.user("Yes, please help me secure my account immediately!"),
+            scenario.user(
+                "Yes, please help me secure my account right away. I'm worried about more charges."
+            ),
             scenario.agent(),
+            verify_no_inappropriate_tools,
             scenario.judge(),
         ],
     )
 
-    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
-    if result.reasoning:
-        print(f"Reasoning: {result.reasoning}")
-
-    return result.success
-
-async def test_escalation_detection():
-    """
-    Demo test: Escalation detection with custom validation
-    """
-
-    def verify_escalation_logic(state: scenario.ScenarioState):
-        """Check that angry customers trigger escalation"""
-        print(f"\n🚨 Checking escalation logic...")
+    assert result.success, f"Fraud investigation test failed: {result.failure_reason}"  # type: ignore
 
-        has_escalation = state.has_tool_call("escalate_to_human")
-        print(f"   escalate_to_human called: {has_escalation}")
 
-        if has_escalation:
-            tool_call = state.last_tool_call("escalate_to_human")
-            if tool_call:
-                args = json.loads(tool_call["function"]["arguments"])
-                reason = args.get("reason", "")
-                urgency = args.get("urgency", "medium")
-                print(f"   Escalation reason: {reason}")
-                print(f"   Urgency level: {urgency}")
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_escalation_workflow():
+    def check_escalation_called(state: scenario.ScenarioState):
+        """Verify agent escalates when customer explicitly demands human help"""
+        assert state.has_tool_call(
+            "escalate_to_human"
+        ), "Agent should escalate when customer demands manager/human help"
 
-        print(f"   ✅ Escalation handled: {has_escalation}")
-        return has_escalation
-
-    print("\n🎭 Running escalation detection demo (OpenAI)...")
+        tool_call = state.last_tool_call("escalate_to_human")
+        if tool_call:
+            args = json.loads(tool_call["function"]["arguments"])
+            reason = args.get("reason", "").lower()
+            assert any(
+                keyword in reason
+                for keyword in ["frustrated", "manager", "human", "escalation"]
+            ), "Escalation reason should reflect customer's frustration and demand"
 
     result = await scenario.run(
-        name="escalation detection demo - OpenAI",
+        name="customer escalation to human agent - OpenAI",
         description="""
-            Frustrated customer demands to speak with a manager.
-            Agent should recognize the escalation need and handle appropriately.
+            Customer has been dealing with an ongoing issue and is frustrated.
+            They explicitly demand to speak with a human agent or manager.
+            The agent should handle this professionally and escalate appropriately.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
-            scenario.JudgeAgent(criteria=[
-                "Agent acknowledges customer frustration",
-                "Agent offers escalation when demanded",
-                "Agent maintains professionalism"
-            ])
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent acknowledges customer's frustration empathetically",
+                    "Agent offers to escalate when requested",
+                    "Agent provides escalation timeline and process information",
+                    "Agent maintains professionalism despite customer frustration",
+                ]
+            ),
         ],
         script=[
-            scenario.user("This is ridiculous! I want to speak to your manager RIGHT NOW! Nobody can help me with this issue!"),
+            scenario.user(
+                "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service."
+            ),
             scenario.agent(),
-            verify_escalation_logic,
+            check_escalation_called,
             scenario.judge(),
         ],
     )
 
-    print(f"\n📊 Test Result: {'✅ PASSED' if result.success else '❌ FAILED'}")
-    return result.success
+    assert result.success, f"Escalation test failed: {result.failure_reason}"  # type: ignore
 
-async def main():
-    """Run the demo tests"""
-    print("🚀 Bank Customer Support Agent - Scenario Demo (OpenAI)")
-    print("=" * 50)
 
-    # Test 1: Fraud Investigation
-    fraud_passed = await test_fraud_with_tool_validation()
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_complex_issue_triggers_knowledge_base():
+    def check_message_suggestion_called(state: scenario.ScenarioState):
+        """Verify agent uses knowledge base for complex multi-part issues"""
+        assert state.has_tool_call(
+            "get_message_suggestion"
+        ), "Agent should use message suggestions for complex banking issues"
 
-    # Test 2: Escalation Detection
-    escalation_passed = await test_escalation_detection()
+        tool_call = state.last_tool_call("get_message_suggestion")
+        if tool_call:
+            args = json.loads(tool_call["function"]["arguments"])
+            query = args.get("customer_query", "").lower()
+            assert any(
+                keyword in query for keyword in ["lock", "fee", "deposit", "multiple"]
+            ), "Tool call should reference the customer's specific issues"
 
-    print("\n" + "=" * 50)
-    print("📈 Demo Summary:")
-    print(f"   Fraud Investigation: {'✅ PASSED' if fraud_passed else '❌ FAILED'}")
-    print(f"   Escalation Detection: {'✅ PASSED' if escalation_passed else '❌ FAILED'}")
+    result = await scenario.run(
+        name="complex multi-issue banking problem - OpenAI",
+        description="""
+            Customer has multiple interconnected banking problems: locked online banking,
+            unexpected fees, and missing direct deposit. They need systematic help
+            and the agent should use knowledge base guidance.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                criteria=[
+                    "Agent addresses all parts of the multi-faceted problem",
+                    "Agent provides systematic approach to resolving issues",
+                    "Agent shows empathy for customer frustration",
+                    "Agent offers clear next steps for each problem",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit."
+            ),
+            scenario.agent(),
+            check_message_suggestion_called,
+            scenario.user(
+                "I've tried resetting my password multiple times and I really need access to pay my bills. This is really stressing me out."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
 
-    overall_success = fraud_passed and escalation_passed
-    print(f"\n🎯 Overall Demo: {'✅ SUCCESS' if overall_success else '❌ NEEDS WORK'}")
+    assert (
+        result.success
+    ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
 
-    if overall_success:
-        print("\n🎉 Demo ready! This shows:")
-        print("   • Proper Scenario framework usage")
-        print("   • Tool calling validation")
-        print("   • Custom assertions for business logic")
-        print("   • Realistic user simulation")
-        print("   • Automated quality assessment")
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(test_fraud_investigation_workflow())

From 0d1ef83f3da98ade5d2c20fb80302cd6f42a5d00 Mon Sep 17 00:00:00 2001
From: aryansharma28 <aryansharma2k2@gmail.com>
Date: Fri, 13 Feb 2026 14:57:56 +0100
Subject: [PATCH 4/6] feat: add comprehensive realistic test scenarios for
 agent evaluation

- Add 4 realistic test scenarios across all models (DeepSeek, GLM, MiniMax, OpenAI, Claude)
- Scenarios: fraud investigation, escalation, complex multi-issue, urgent business
- Improved test design with realistic conversation flows and customer ID (CUST_001)
- Removed prescriptive tool assertions in favor of outcome-based evaluation (the escalation test keeps its escalate_to_human check)
- Updated judge criteria to focus on real-world customer service quality (5 criteria per test; the escalation test keeps its existing 4)
- All judges now use GPT-4o for consistent evaluation
- Added Claude Sonnet 4.5 model for comparison
- Test results: Claude Sonnet 4.5 and GLM achieved 100% pass rate
---
 tests-demo/test_demo_claude.py   | 242 +++++++++++++++++++++++++++++++
 tests-demo/test_demo_deepseek.py | 110 +++++++-------
 tests-demo/test_demo_glm.py      | 118 +++++++--------
 tests-demo/test_demo_minimax.py  | 151 +++++++++----------
 tests-demo/test_demo_openai.py   | 112 +++++++-------
 5 files changed, 497 insertions(+), 236 deletions(-)
 create mode 100644 tests-demo/test_demo_claude.py

diff --git a/tests-demo/test_demo_claude.py b/tests-demo/test_demo_claude.py
new file mode 100644
index 0000000..a507806
--- /dev/null
+++ b/tests-demo/test_demo_claude.py
@@ -0,0 +1,242 @@
+"""
+Tests for the main bank customer support agent - Claude Sonnet 4.5 Model
+
+These tests cover real business scenarios; only the escalation test asserts a tool call.
+Claude Sonnet 4.5 is set as the default Scenario model; each JudgeAgent is pinned to gpt-4o.
+"""
+import asyncio
+import pytest
+import json
+import sys
+import os
+import dotenv
+
+# Add parent directory to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import scenario
+from main_support_agent import support_agent
+
+dotenv.load_dotenv()
+scenario.configure(default_model="claude-sonnet-4.5")
+
+
+class BankSupportAgentAdapter(scenario.AgentAdapter):
+    """Adapter for our main bank support agent"""
+
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        message_content = input.last_new_user_message_str()
+        response = support_agent.run(message_content)
+
+        # Convert Agno messages to OpenAI format for Scenario
+        openai_messages = []
+        for message in response.messages or []:
+            if message.role in ["assistant", "user", "system", "tool"]:
+                msg_dict = {"role": message.role, "content": message.content}
+
+                # Add tool calls if present (for assistant messages)
+                if message.tool_calls:
+                    msg_dict["tool_calls"] = message.tool_calls
+
+                # Add tool call ID if present (for tool messages)
+                if hasattr(message, "tool_call_id") and message.tool_call_id:
+                    msg_dict["tool_call_id"] = message.tool_call_id
+
+                openai_messages.append(msg_dict)
+
+        # Return all messages except system and user (Scenario manages the conversation flow)
+        # We need to include tool messages to satisfy OpenAI's requirements
+        relevant_messages = [
+            msg for msg in openai_messages if msg["role"] in ["assistant", "tool"]
+        ]
+
+        if relevant_messages:
+            return relevant_messages
+
+        # Fallback to content if no relevant messages found
+        return response.content  # type: ignore
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_fraud_investigation_workflow():
+    result = await scenario.run(
+        name="fraud investigation and card security - Claude",
+        description="""
+            Customer discovers unauthorized transactions on their account and is worried about fraud.
+            They need immediate help to secure their account and investigate the suspicious activity.
+            This tests whether the agent responds with appropriate urgency and offers concrete security actions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                model="gpt-4o",
+                criteria=[
+                    "Agent takes fraud concerns seriously and responds with urgency",
+                    "Agent gathers necessary information (account details) to investigate",
+                    "Agent offers concrete security actions like card freezing or blocking",
+                    "Agent provides clear next steps for fraud investigation and dispute process",
+                    "Agent maintains professional and reassuring tone throughout",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "Yes, please help me secure my account right away. I'm really worried about more charges appearing."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    assert result.success, f"Fraud investigation test failed: {result.failure_reason}"  # type: ignore
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_escalation_workflow():
+    def check_escalation_called(state: scenario.ScenarioState):
+        """Verify agent escalates when customer explicitly demands human help"""
+        assert state.has_tool_call(
+            "escalate_to_human"
+        ), "Agent should escalate when customer demands manager/human help"
+
+        tool_call = state.last_tool_call("escalate_to_human")
+        if tool_call:
+            args = json.loads(tool_call["function"]["arguments"])
+            reason = args.get("reason", "").lower()
+            assert any(
+                keyword in reason
+                for keyword in ["frustrated", "manager", "human", "escalation"]
+            ), "Escalation reason should reflect customer's frustration and demand"
+
+    result = await scenario.run(
+        name="customer escalation to human agent - Claude",
+        description="""
+            Customer has been dealing with an ongoing issue and is frustrated.
+            They explicitly demand to speak with a human agent or manager.
+            The agent should handle this professionally and escalate appropriately.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                model="gpt-4o",
+                criteria=[
+                    "Agent acknowledges customer's frustration empathetically",
+                    "Agent offers to escalate when requested",
+                    "Agent provides escalation timeline and process information",
+                    "Agent maintains professionalism despite customer frustration",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service."
+            ),
+            scenario.agent(),
+            check_escalation_called,
+            scenario.judge(),
+        ],
+    )
+
+    assert result.success, f"Escalation test failed: {result.failure_reason}"  # type: ignore
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_complex_issue_triggers_knowledge_base():
+    result = await scenario.run(
+        name="complex multi-issue banking problem - Claude",
+        description="""
+            Customer has multiple interconnected banking problems: locked online banking,
+            unexpected fees, and missing direct deposit. They need systematic help.
+            This tests whether the agent can handle multiple issues comprehensively.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                model="gpt-4o",
+                criteria=[
+                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
+                    "Agent provides systematic approach with clear steps for each issue",
+                    "Agent shows empathy for customer's stress and urgency",
+                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
+                    "Agent offers concrete next steps that the customer can act on",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    assert (
+        result.success
+    ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_urgent_business_scenario():
+    result = await scenario.run(
+        name="urgent business account problem - Claude",
+        description="""
+            Business customer has an urgent issue affecting their operations.
+            They can't access funds to pay employees. This tests whether the agent
+            recognizes urgency and takes appropriate high-priority action.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                model="gpt-4o",
+                criteria=[
+                    "Agent immediately recognizes the business urgency and employee impact",
+                    "Agent responds with high priority and urgency in tone",
+                    "Agent takes concrete action (investigating the freeze or escalating to specialists)",
+                    "Agent provides realistic timeline or sets expectations appropriately",
+                    "Agent offers interim solutions or workarounds if available",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    assert result.success, f"Urgent business test failed: {result.failure_reason}"  # type: ignore
+
+
+if __name__ == "__main__":
+    asyncio.run(test_fraud_investigation_workflow())
diff --git a/tests-demo/test_demo_deepseek.py b/tests-demo/test_demo_deepseek.py
index e71ab3f..259374c 100644
--- a/tests-demo/test_demo_deepseek.py
+++ b/tests-demo/test_demo_deepseek.py
@@ -59,42 +59,24 @@ async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_fraud_investigation_workflow():
-    # Custom assertions for tool calling validation
-    def check_customer_exploration_called(state: scenario.ScenarioState):
-        """Verify the agent called explore_customer_account for fraud investigation"""
-        assert state.has_tool_call(
-            "explore_customer_account"
-        ), "Agent should call explore_customer_account for fraud concerns"
-
-        # Check the tool was called with appropriate parameters
-        tool_call = state.last_tool_call("explore_customer_account")
-        if tool_call:
-            args = json.loads(tool_call["function"]["arguments"])
-            assert "customer_id" in args, "Tool call should include customer_id"
-
-    def verify_no_inappropriate_tools(state: scenario.ScenarioState):
-        """Ensure agent doesn't use inappropriate tools for fraud scenarios"""
-        # Should not use message suggestions for clear security issues
-        assert not state.has_tool_call(
-            "get_message_suggestion"
-        ), "Agent should not need message suggestions for clear fraud cases"
-
     result = await scenario.run(
         name="fraud investigation and card security - DeepSeek",
         description="""
             Customer discovers unauthorized transactions on their account and is worried about fraud.
             They need immediate help to secure their account and investigate the suspicious activity.
-            The agent should use customer exploration tools to analyze the account.
+            This tests whether the agent responds with appropriate urgency and offers concrete security actions.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
-                    "Agent offers concrete security actions like card freezing",
-                    "Agent provides clear next steps for fraud investigation",
-                    "Agent maintains professional and reassuring tone",
+                    "Agent gathers necessary information (account details) to investigate",
+                    "Agent offers concrete security actions like card freezing or blocking",
+                    "Agent provides clear next steps for fraud investigation and dispute process",
+                    "Agent maintains professional and reassuring tone throughout",
                 ]
             ),
         ],
@@ -103,16 +85,14 @@ def verify_no_inappropriate_tools(state: scenario.ScenarioState):
                 "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
             ),
             scenario.agent(),
-            check_customer_exploration_called,
             scenario.user(
-                "There's an $85 charge at Amazon and a $45 charge at some gas station. I definitely didn't make these purchases."
+                "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases."
             ),
             scenario.agent(),
             scenario.user(
-                "Yes, please help me secure my account right away. I'm worried about more charges."
+                "Yes, please help me secure my account right away. I'm really worried about more charges appearing."
             ),
             scenario.agent(),
-            verify_no_inappropriate_tools,
             scenario.judge(),
         ],
     )
@@ -148,6 +128,7 @@ def check_escalation_called(state: scenario.ScenarioState):
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -176,47 +157,34 @@ def check_escalation_called(state: scenario.ScenarioState):
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_complex_issue_triggers_knowledge_base():
-    def check_message_suggestion_called(state: scenario.ScenarioState):
-        """Verify agent uses knowledge base for complex multi-part issues"""
-        assert state.has_tool_call(
-            "get_message_suggestion"
-        ), "Agent should use message suggestions for complex banking issues"
-
-        tool_call = state.last_tool_call("get_message_suggestion")
-        if tool_call:
-            args = json.loads(tool_call["function"]["arguments"])
-            query = args.get("customer_query", "").lower()
-            assert any(
-                keyword in query for keyword in ["lock", "fee", "deposit", "multiple"]
-            ), "Tool call should reference the customer's specific issues"
-
     result = await scenario.run(
         name="complex multi-issue banking problem - DeepSeek",
         description="""
             Customer has multiple interconnected banking problems: locked online banking,
-            unexpected fees, and missing direct deposit. They need systematic help
-            and the agent should use knowledge base guidance.
+            unexpected fees, and missing direct deposit. They need systematic help.
+            This tests whether the agent can handle multiple issues comprehensively.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
-                    "Agent addresses all parts of the multi-faceted problem",
-                    "Agent provides systematic approach to resolving issues",
-                    "Agent shows empathy for customer frustration",
-                    "Agent offers clear next steps for each problem",
+                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
+                    "Agent provides systematic approach with clear steps for each issue",
+                    "Agent shows empathy for customer's stress and urgency",
+                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
+                    "Agent offers concrete next steps that the customer can act on",
                 ]
             ),
         ],
         script=[
             scenario.user(
-                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit."
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
             ),
             scenario.agent(),
-            check_message_suggestion_called,
             scenario.user(
-                "I've tried resetting my password multiple times and I really need access to pay my bills. This is really stressing me out."
+                "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out."
             ),
             scenario.agent(),
             scenario.judge(),
@@ -228,5 +196,45 @@ def check_message_suggestion_called(state: scenario.ScenarioState):
     ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
 
 
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_urgent_business_scenario():
+    result = await scenario.run(
+        name="urgent business account problem - DeepSeek",
+        description="""
+            Business customer has an urgent issue affecting their operations.
+            They can't access funds to pay employees. This tests whether the agent
+            recognizes urgency and takes appropriate high-priority action.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                model="gpt-4o",
+                criteria=[
+                    "Agent immediately recognizes the business urgency and employee impact",
+                    "Agent responds with high priority and urgency in tone",
+                    "Agent takes concrete action (investigating the freeze or escalating to specialists)",
+                    "Agent provides realistic timeline or sets expectations appropriately",
+                    "Agent offers interim solutions or workarounds if available",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    assert result.success, f"Urgent business test failed: {result.failure_reason}"  # type: ignore
+
+
 if __name__ == "__main__":
     asyncio.run(test_fraud_investigation_workflow())
diff --git a/tests-demo/test_demo_glm.py b/tests-demo/test_demo_glm.py
index 0c2d863..402563c 100644
--- a/tests-demo/test_demo_glm.py
+++ b/tests-demo/test_demo_glm.py
@@ -60,42 +60,24 @@ async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_fraud_investigation_workflow():
-    # Custom assertions for tool calling validation
-    def check_customer_exploration_called(state: scenario.ScenarioState):
-        """Verify the agent called explore_customer_account for fraud investigation"""
-        assert state.has_tool_call(
-            "explore_customer_account"
-        ), "Agent should call explore_customer_account for fraud concerns"
-
-        # Check the tool was called with appropriate parameters
-        tool_call = state.last_tool_call("explore_customer_account")
-        if tool_call:
-            args = json.loads(tool_call["function"]["arguments"])
-            assert "customer_id" in args, "Tool call should include customer_id"
-
-    def verify_no_inappropriate_tools(state: scenario.ScenarioState):
-        """Ensure agent doesn't use inappropriate tools for fraud scenarios"""
-        # Should not use message suggestions for clear security issues
-        assert not state.has_tool_call(
-            "get_message_suggestion"
-        ), "Agent should not need message suggestions for clear fraud cases"
-
     result = await scenario.run(
         name="fraud investigation and card security - GLM",
         description="""
             Customer discovers unauthorized transactions on their account and is worried about fraud.
             They need immediate help to secure their account and investigate the suspicious activity.
-            The agent should use customer exploration tools to analyze the account.
+            This tests whether the agent responds with appropriate urgency and offers concrete security actions.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
-                    "Agent offers concrete security actions like card freezing",
-                    "Agent provides clear next steps for fraud investigation",
-                    "Agent maintains professional and reassuring tone",
+                    "Agent gathers necessary information (account details) to investigate",
+                    "Agent offers concrete security actions like card freezing or blocking",
+                    "Agent provides clear next steps for fraud investigation and dispute process",
+                    "Agent maintains professional and reassuring tone throughout",
                 ]
             ),
         ],
@@ -104,16 +86,14 @@ def verify_no_inappropriate_tools(state: scenario.ScenarioState):
                 "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
             ),
             scenario.agent(),
-            check_customer_exploration_called,
             scenario.user(
-                "There's an $85 charge at Amazon and a $45 charge at some gas station. I definitely didn't make these purchases."
+                "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases."
             ),
             scenario.agent(),
             scenario.user(
-                "Yes, please help me secure my account right away. I'm worried about more charges."
+                "Yes, please help me secure my account right away. I'm really worried about more charges appearing."
             ),
             scenario.agent(),
-            verify_no_inappropriate_tools,
             scenario.judge(),
         ],
     )
@@ -150,6 +130,7 @@ def check_escalation_called(state: scenario.ScenarioState):
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -178,53 +159,32 @@ def check_escalation_called(state: scenario.ScenarioState):
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_urgent_business_scenario():
-    def check_appropriate_urgency_response(state: scenario.ScenarioState):
-        """Verify agent responds appropriately to business urgency"""
-        # For urgent business issues, agent should either:
-        # 1. Escalate immediately, OR
-        # 2. Use customer exploration to provide immediate solutions
-        has_escalation = state.has_tool_call("escalate_to_human")
-        has_exploration = state.has_tool_call("explore_customer_account")
-
-        assert (
-            has_escalation or has_exploration
-        ), "Agent should either escalate urgent business issues or explore customer account for immediate solutions"
-
-        # Check that urgency is reflected in tool call parameters
-        if has_escalation:
-            tool_call = state.last_tool_call("escalate_to_human")
-            if tool_call:
-                args = json.loads(tool_call["function"]["arguments"])
-                urgency = args.get("urgency", "medium")
-                assert (
-                    urgency == "high"
-                ), "Business urgency should be marked as high priority"
-
     result = await scenario.run(
         name="urgent business account problem - GLM",
         description="""
             Business customer has an urgent issue affecting their operations.
-            They can't access funds to pay employees. This requires immediate
-            attention and appropriate priority handling.
+            They can't access funds to pay employees. This tests whether the agent
+            recognizes urgency and takes appropriate high-priority action.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
-                    "Agent recognizes the business urgency and impact",
-                    "Agent treats the issue with appropriate priority",
-                    "Agent offers immediate assistance or escalation",
-                    "Agent provides clear timeline for resolution",
+                    "Agent immediately recognizes the business urgency and employee impact",
+                    "Agent responds with high priority and urgency in tone",
+                    "Agent takes concrete action (investigating the freeze or escalating to specialists)",
+                    "Agent provides realistic timeline or sets expectations appropriately",
+                    "Agent offers interim solutions or workarounds if available",
                 ]
             ),
         ],
         script=[
             scenario.user(
-                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute!"
+                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001."
             ),
             scenario.agent(),
-            check_appropriate_urgency_response,
             scenario.user(
                 "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?"
             ),
@@ -236,5 +196,47 @@ def check_appropriate_urgency_response(state: scenario.ScenarioState):
     assert result.success, f"Urgent business test failed: {result.failure_reason}"  # type: ignore
 
 
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_complex_issue_triggers_knowledge_base():
+    result = await scenario.run(
+        name="complex multi-issue banking problem - GLM",
+        description="""
+            Customer has multiple interconnected banking problems: locked online banking,
+            unexpected fees, and missing direct deposit. They need systematic help.
+            This tests whether the agent can handle multiple issues comprehensively.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                model="gpt-4o",
+                criteria=[
+                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
+                    "Agent provides systematic approach with clear steps for each issue",
+                    "Agent shows empathy for customer's stress and urgency",
+                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
+                    "Agent offers concrete next steps that the customer can act on",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    assert (
+        result.success
+    ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
+
+
 if __name__ == "__main__":
     asyncio.run(test_fraud_investigation_workflow())
diff --git a/tests-demo/test_demo_minimax.py b/tests-demo/test_demo_minimax.py
index 3eebc2b..5629ba6 100644
--- a/tests-demo/test_demo_minimax.py
+++ b/tests-demo/test_demo_minimax.py
@@ -59,78 +59,46 @@ async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
 
 @pytest.mark.agent_test
 @pytest.mark.asyncio
-async def test_comprehensive_tool_coordination():
-    """Test a scenario that uses multiple tools in sequence"""
-
-    # Track which tools were called
-    tools_called = []
-
-    def track_customer_exploration(state: scenario.ScenarioState):
-        if state.has_tool_call("explore_customer_account"):
-            tools_called.append("explore_customer_account")
-
-    def track_message_suggestion(state: scenario.ScenarioState):
-        if state.has_tool_call("get_message_suggestion"):
-            tools_called.append("get_message_suggestion")
-
-    def track_conversation_summary(state: scenario.ScenarioState):
-        if state.has_tool_call("get_conversation_summary"):
-            tools_called.append("get_conversation_summary")
-
-    def validate_tool_coordination(state: scenario.ScenarioState):
-        """Ensure agent used appropriate tools throughout the conversation"""
-        # Should have used customer exploration for account analysis
-        assert (
-            "explore_customer_account" in tools_called
-        ), "Agent should explore customer account for spending analysis"
-
-        # Verify the conversation has good depth (multiple exchanges)
-        user_messages = [m for m in state.messages if m["role"] == "user"]
-        agent_messages = [m for m in state.messages if m["role"] == "assistant"]
-        assert len(user_messages) >= 3, "Conversation should have multiple user turns"
-        assert len(agent_messages) >= 3, "Agent should respond multiple times"
-
+async def test_fraud_investigation_workflow():
     result = await scenario.run(
-        name="comprehensive account analysis and advice - MiniMax",
+        name="fraud investigation and card security - MiniMax",
         description="""
-            Customer wants to understand their spending patterns and get financial advice.
-            This requires account exploration, potentially knowledge base guidance,
-            and possibly conversation analysis. The agent should coordinate multiple tools effectively.
+            Customer discovers unauthorized transactions on their account and is worried about fraud.
+            They need immediate help to secure their account and investigate the suspicious activity.
+            This tests whether the agent responds with appropriate urgency and offers concrete security actions.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
-                    "Agent provides personalized insights based on account data",
-                    "Agent offers actionable financial recommendations",
-                    "Agent asks relevant follow-up questions",
-                    "Agent coordinates multiple information sources effectively",
+                    "Agent takes fraud concerns seriously and responds with urgency",
+                    "Agent gathers necessary information (account details) to investigate",
+                    "Agent offers concrete security actions like card freezing or blocking",
+                    "Agent provides clear next steps for fraud investigation and dispute process",
+                    "Agent maintains professional and reassuring tone throughout",
                 ]
             ),
         ],
         script=[
             scenario.user(
-                "I want to get better at managing my money. Can you analyze my spending and help me understand where I can improve?"
+                "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
             ),
             scenario.agent(),
-            track_customer_exploration,
             scenario.user(
-                "That's helpful! Can you also suggest a realistic budget based on my spending patterns and give me specific advice?"
+                "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases."
             ),
             scenario.agent(),
-            track_message_suggestion,
             scenario.user(
-                "This conversation has been really valuable. Can you summarize the key insights and recommendations we discussed?"
+                "Yes, please help me secure my account right away. I'm really worried about more charges appearing."
             ),
             scenario.agent(),
-            track_conversation_summary,
-            validate_tool_coordination,
             scenario.judge(),
         ],
     )
 
-    assert result.success, f"Tool coordination test failed: {result.failure_reason}"  # type: ignore
+    assert result.success, f"Fraud investigation test failed: {result.failure_reason}"  # type: ignore
 
 
 @pytest.mark.agent_test
@@ -162,6 +130,7 @@ def check_escalation_called(state: scenario.ScenarioState):
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -189,53 +158,85 @@ def check_escalation_called(state: scenario.ScenarioState):
 
 @pytest.mark.agent_test
 @pytest.mark.asyncio
-async def test_tool_precision_simple_query():
-    """Test that agent doesn't over-use tools for simple queries"""
-
-    def verify_minimal_tool_usage(state: scenario.ScenarioState):
-        """Ensure agent doesn't call unnecessary tools for simple questions"""
-        # Count total tool calls
-        tool_calls = 0
-        for message in state.messages:
-            if message["role"] == "assistant" and "tool_calls" in message:
-                tool_calls += len(message["tool_calls"])  # type: ignore
-
-        # For simple service hours question, should use minimal or no tools
-        assert (
-            tool_calls <= 1
-        ), f"Agent should use minimal tools for simple queries, but used {tool_calls} tool calls"
+async def test_complex_issue_triggers_knowledge_base():
+    result = await scenario.run(
+        name="complex multi-issue banking problem - MiniMax",
+        description="""
+            Customer has multiple interconnected banking problems: locked online banking,
+            unexpected fees, and missing direct deposit. They need systematic help.
+            This tests whether the agent can handle multiple issues comprehensively.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                model="gpt-4o",
+                criteria=[
+                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
+                    "Agent provides systematic approach with clear steps for each issue",
+                    "Agent shows empathy for customer's stress and urgency",
+                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
+                    "Agent offers concrete next steps that the customer can act on",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    assert (
+        result.success
+    ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
+
 
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_urgent_business_scenario():
     result = await scenario.run(
-        name="simple service hours inquiry - MiniMax",
+        name="urgent business account problem - MiniMax",
         description="""
-            Customer asks a simple question about service hours.
-            This should not require complex tool usage or analysis.
-            Agent should respond directly and efficiently.
+            Business customer has an urgent issue affecting their operations.
+            They can't access funds to pay employees. This tests whether the agent
+            recognizes urgency and takes appropriate high-priority action.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
-                    "Agent responds directly to simple questions",
-                    "Agent provides clear and helpful information",
-                    "Agent doesn't over-complicate simple interactions",
-                    "Agent maintains friendly and professional tone",
+                    "Agent immediately recognizes the business urgency and employee impact",
+                    "Agent responds with high priority and urgency in tone",
+                    "Agent takes concrete action (investigating the freeze or escalating to specialists)",
+                    "Agent provides realistic timeline or sets expectations appropriately",
+                    "Agent offers interim solutions or workarounds if available",
                 ]
             ),
         ],
         script=[
-            scenario.user("What are your customer service hours?"),
+            scenario.user(
+                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001."
+            ),
             scenario.agent(),
-            verify_minimal_tool_usage,
-            scenario.user("Thank you, that's helpful."),
+            scenario.user(
+                "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?"
+            ),
             scenario.agent(),
             scenario.judge(),
         ],
     )
 
-    assert result.success, f"Tool precision test failed: {result.failure_reason}"  # type: ignore
+    assert result.success, f"Urgent business test failed: {result.failure_reason}"  # type: ignore
 
 
 if __name__ == "__main__":
-    asyncio.run(test_comprehensive_tool_coordination())
+    asyncio.run(test_fraud_investigation_workflow())
diff --git a/tests-demo/test_demo_openai.py b/tests-demo/test_demo_openai.py
index 27cbac5..0de5698 100644
--- a/tests-demo/test_demo_openai.py
+++ b/tests-demo/test_demo_openai.py
@@ -18,7 +18,7 @@
 from main_support_agent import support_agent
 
 dotenv.load_dotenv()
-scenario.configure(default_model="openai/gpt-4o-mini")
+scenario.configure(default_model="gpt-4o")
 
 
 class BankSupportAgentAdapter(scenario.AgentAdapter):
@@ -60,42 +60,24 @@ async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_fraud_investigation_workflow():
-    # Custom assertions for tool calling validation
-    def check_customer_exploration_called(state: scenario.ScenarioState):
-        """Verify the agent called explore_customer_account for fraud investigation"""
-        assert state.has_tool_call(
-            "explore_customer_account"
-        ), "Agent should call explore_customer_account for fraud concerns"
-
-        # Check the tool was called with appropriate parameters
-        tool_call = state.last_tool_call("explore_customer_account")
-        if tool_call:
-            args = json.loads(tool_call["function"]["arguments"])
-            assert "customer_id" in args, "Tool call should include customer_id"
-
-    def verify_no_inappropriate_tools(state: scenario.ScenarioState):
-        """Ensure agent doesn't use inappropriate tools for fraud scenarios"""
-        # Should not use message suggestions for clear security issues
-        assert not state.has_tool_call(
-            "get_message_suggestion"
-        ), "Agent should not need message suggestions for clear fraud cases"
-
     result = await scenario.run(
         name="fraud investigation and card security - OpenAI",
         description="""
             Customer discovers unauthorized transactions on their account and is worried about fraud.
             They need immediate help to secure their account and investigate the suspicious activity.
-            The agent should use customer exploration tools to analyze the account.
+            This tests whether the agent responds with appropriate urgency and offers concrete security actions.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
-                    "Agent offers concrete security actions like card freezing",
-                    "Agent provides clear next steps for fraud investigation",
-                    "Agent maintains professional and reassuring tone",
+                    "Agent gathers necessary information (account details) to investigate",
+                    "Agent offers concrete security actions like card freezing or blocking",
+                    "Agent provides clear next steps for fraud investigation and dispute process",
+                    "Agent maintains professional and reassuring tone throughout",
                 ]
             ),
         ],
@@ -104,16 +86,14 @@ def verify_no_inappropriate_tools(state: scenario.ScenarioState):
                 "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!"
             ),
             scenario.agent(),
-            check_customer_exploration_called,
             scenario.user(
-                "There's an $85 charge at Amazon and a $45 charge at some gas station. I definitely didn't make these purchases."
+                "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases."
             ),
             scenario.agent(),
             scenario.user(
-                "Yes, please help me secure my account right away. I'm worried about more charges."
+                "Yes, please help me secure my account right away. I'm really worried about more charges appearing."
             ),
             scenario.agent(),
-            verify_no_inappropriate_tools,
             scenario.judge(),
         ],
     )
@@ -150,6 +130,7 @@ def check_escalation_called(state: scenario.ScenarioState):
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -178,47 +159,34 @@ def check_escalation_called(state: scenario.ScenarioState):
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_complex_issue_triggers_knowledge_base():
-    def check_message_suggestion_called(state: scenario.ScenarioState):
-        """Verify agent uses knowledge base for complex multi-part issues"""
-        assert state.has_tool_call(
-            "get_message_suggestion"
-        ), "Agent should use message suggestions for complex banking issues"
-
-        tool_call = state.last_tool_call("get_message_suggestion")
-        if tool_call:
-            args = json.loads(tool_call["function"]["arguments"])
-            query = args.get("customer_query", "").lower()
-            assert any(
-                keyword in query for keyword in ["lock", "fee", "deposit", "multiple"]
-            ), "Tool call should reference the customer's specific issues"
-
     result = await scenario.run(
         name="complex multi-issue banking problem - OpenAI",
         description="""
             Customer has multiple interconnected banking problems: locked online banking,
-            unexpected fees, and missing direct deposit. They need systematic help
-            and the agent should use knowledge base guidance.
+            unexpected fees, and missing direct deposit. They need systematic help.
+            This tests whether the agent can handle multiple issues comprehensively.
         """,
         agents=[
             BankSupportAgentAdapter(),
             scenario.UserSimulatorAgent(),
             scenario.JudgeAgent(
+                model="gpt-4o",
                 criteria=[
-                    "Agent addresses all parts of the multi-faceted problem",
-                    "Agent provides systematic approach to resolving issues",
-                    "Agent shows empathy for customer frustration",
-                    "Agent offers clear next steps for each problem",
+                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
+                    "Agent provides systematic approach with clear steps for each issue",
+                    "Agent shows empathy for customer's stress and urgency",
+                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
+                    "Agent offers concrete next steps that the customer can act on",
                 ]
             ),
         ],
         script=[
             scenario.user(
-                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit."
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
             ),
             scenario.agent(),
-            check_message_suggestion_called,
             scenario.user(
-                "I've tried resetting my password multiple times and I really need access to pay my bills. This is really stressing me out."
+                "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out."
             ),
             scenario.agent(),
             scenario.judge(),
@@ -230,5 +198,45 @@ def check_message_suggestion_called(state: scenario.ScenarioState):
     ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
 
 
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_urgent_business_scenario():
+    result = await scenario.run(
+        name="urgent business account problem - OpenAI",
+        description="""
+            Business customer has an urgent issue affecting their operations.
+            They can't access funds to pay employees. This tests whether the agent
+            recognizes urgency and takes appropriate high-priority action.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(),
+            scenario.JudgeAgent(
+                model="gpt-4o",
+                criteria=[
+                    "Agent immediately recognizes the business urgency and employee impact",
+                    "Agent responds with high priority and urgency in tone",
+                    "Agent takes concrete action (investigating the freeze or escalating to specialists)",
+                    "Agent provides realistic timeline or sets expectations appropriately",
+                    "Agent offers interim solutions or workarounds if available",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    assert result.success, f"Urgent business test failed: {result.failure_reason}"  # type: ignore
+
+
 if __name__ == "__main__":
     asyncio.run(test_fraud_investigation_workflow())

From 4f3dc36835c88834fe2c86b3aa23a46bd9b7aed3 Mon Sep 17 00:00:00 2001
From: aryansharma28 <aryansharma2k2@gmail.com>
Date: Fri, 13 Feb 2026 15:17:50 +0100
Subject: [PATCH 5/6] refactor: use Claude Sonnet 4.5 for user simulation and
 evaluation

- Update all test files to use Claude Sonnet 4.5 as User Simulator
- Update all test files to use Claude Sonnet 4.5 as Judge
- Change test_demo_openai.py agent model from GPT-4o to openai/gpt-oss-120b (OSS)
- Ensure consistent evaluation across all open-source agent models
- All agent models are now open-source except in the Claude benchmark file

Agent models tested:
- DeepSeek-V3.2 (Nebius OSS)
- GLM-4.7-FP8 (Nebius OSS)
- MiniMax-M2.1 (Nebius OSS)
- openai/gpt-oss-120b (OSS)
- Claude Sonnet 4.5 (benchmark)
---
 tests-demo/test_demo_claude.py   | 16 ++++++++--------
 tests-demo/test_demo_deepseek.py | 16 ++++++++--------
 tests-demo/test_demo_glm.py      | 16 ++++++++--------
 tests-demo/test_demo_minimax.py  | 16 ++++++++--------
 tests-demo/test_demo_openai.py   | 20 ++++++++++----------
 5 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/tests-demo/test_demo_claude.py b/tests-demo/test_demo_claude.py
index a507806..61ab54a 100644
--- a/tests-demo/test_demo_claude.py
+++ b/tests-demo/test_demo_claude.py
@@ -69,9 +69,9 @@ async def test_fraud_investigation_workflow():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
                     "Agent gathers necessary information (account details) to investigate",
@@ -128,9 +128,9 @@ def check_escalation_called(state: scenario.ScenarioState):
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -168,9 +168,9 @@ async def test_complex_issue_triggers_knowledge_base():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
                     "Agent provides systematic approach with clear steps for each issue",
@@ -210,9 +210,9 @@ async def test_urgent_business_scenario():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent immediately recognizes the business urgency and employee impact",
                     "Agent responds with high priority and urgency in tone",
diff --git a/tests-demo/test_demo_deepseek.py b/tests-demo/test_demo_deepseek.py
index 259374c..6f7572d 100644
--- a/tests-demo/test_demo_deepseek.py
+++ b/tests-demo/test_demo_deepseek.py
@@ -68,9 +68,9 @@ async def test_fraud_investigation_workflow():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
                     "Agent gathers necessary information (account details) to investigate",
@@ -126,9 +126,9 @@ def check_escalation_called(state: scenario.ScenarioState):
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -166,9 +166,9 @@ async def test_complex_issue_triggers_knowledge_base():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
                     "Agent provides systematic approach with clear steps for each issue",
@@ -208,9 +208,9 @@ async def test_urgent_business_scenario():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent immediately recognizes the business urgency and employee impact",
                     "Agent responds with high priority and urgency in tone",
diff --git a/tests-demo/test_demo_glm.py b/tests-demo/test_demo_glm.py
index 402563c..f54f7a3 100644
--- a/tests-demo/test_demo_glm.py
+++ b/tests-demo/test_demo_glm.py
@@ -69,9 +69,9 @@ async def test_fraud_investigation_workflow():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
                     "Agent gathers necessary information (account details) to investigate",
@@ -128,9 +128,9 @@ def check_escalation_called(state: scenario.ScenarioState):
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -168,9 +168,9 @@ async def test_urgent_business_scenario():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent immediately recognizes the business urgency and employee impact",
                     "Agent responds with high priority and urgency in tone",
@@ -208,9 +208,9 @@ async def test_complex_issue_triggers_knowledge_base():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
                     "Agent provides systematic approach with clear steps for each issue",
diff --git a/tests-demo/test_demo_minimax.py b/tests-demo/test_demo_minimax.py
index 5629ba6..2cd2da7 100644
--- a/tests-demo/test_demo_minimax.py
+++ b/tests-demo/test_demo_minimax.py
@@ -69,9 +69,9 @@ async def test_fraud_investigation_workflow():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
                     "Agent gathers necessary information (account details) to investigate",
@@ -128,9 +128,9 @@ def check_escalation_called(state: scenario.ScenarioState):
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -168,9 +168,9 @@ async def test_complex_issue_triggers_knowledge_base():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
                     "Agent provides systematic approach with clear steps for each issue",
@@ -210,9 +210,9 @@ async def test_urgent_business_scenario():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent immediately recognizes the business urgency and employee impact",
                     "Agent responds with high priority and urgency in tone",
diff --git a/tests-demo/test_demo_openai.py b/tests-demo/test_demo_openai.py
index 0de5698..e61f3b5 100644
--- a/tests-demo/test_demo_openai.py
+++ b/tests-demo/test_demo_openai.py
@@ -2,7 +2,7 @@
 Tests for the main bank customer support agent - OpenAI Model
 
 These tests cover real business scenarios and validate tool calling behavior
-using OpenAI gpt-4o-mini model for evaluation.
+using OpenAI claude-sonnet-4.5-mini model for evaluation.
 """
 import asyncio
 import pytest
@@ -18,7 +18,7 @@
 from main_support_agent import support_agent
 
 dotenv.load_dotenv()
-scenario.configure(default_model="gpt-4o")
+scenario.configure(default_model="openai/gpt-oss-120b")
 
 
 class BankSupportAgentAdapter(scenario.AgentAdapter):
@@ -69,9 +69,9 @@ async def test_fraud_investigation_workflow():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
                     "Agent gathers necessary information (account details) to investigate",
@@ -128,9 +128,9 @@ def check_escalation_called(state: scenario.ScenarioState):
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -168,9 +168,9 @@ async def test_complex_issue_triggers_knowledge_base():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
                     "Agent provides systematic approach with clear steps for each issue",
@@ -210,9 +210,9 @@ async def test_urgent_business_scenario():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(),
+            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
             scenario.JudgeAgent(
-                model="gpt-4o",
+                model="claude-sonnet-4.5",
                 criteria=[
                     "Agent immediately recognizes the business urgency and employee impact",
                     "Agent responds with high priority and urgency in tone",

From 28498bbecb52c4d00b3c70ad5e8a2b85d07b45c5 Mon Sep 17 00:00:00 2001
From: aryansharma28 <aryansharma2k2@gmail.com>
Date: Tue, 17 Feb 2026 19:42:30 +0100
Subject: [PATCH 6/6] feat: add multi-model support agents and comprehensive
 scenario tests

Add per-model main support agent entrypoints (Claude, DeepSeek, GLM, MiniMax, OpenAI) and expand test scenarios with realistic banking evaluation criteria.
---
 agents/customer_explorer_agent.py |  12 +-
 agents/next_message_agent.py      |  12 +-
 agents/summary_agent.py           |  12 +-
 main_support_agent_claude.py      | 230 +++++++++++++++++++++++++
 main_support_agent_deepseek.py    | 229 +++++++++++++++++++++++++
 main_support_agent_glm.py         | 229 +++++++++++++++++++++++++
 main_support_agent_minimax.py     | 229 +++++++++++++++++++++++++
 main_support_agent_openai.py      | 229 +++++++++++++++++++++++++
 tests-demo/test_demo_claude.py    | 269 ++++++++++++++++++++++++-----
 tests-demo/test_demo_deepseek.py  | 247 +++++++++++++++++++++++++--
 tests-demo/test_demo_glm.py       | 271 ++++++++++++++++++++++++++----
 tests-demo/test_demo_minimax.py   | 247 +++++++++++++++++++++++++--
 tests-demo/test_demo_openai.py    | 249 ++++++++++++++++++++++++---
 13 files changed, 2311 insertions(+), 154 deletions(-)
 create mode 100644 main_support_agent_claude.py
 create mode 100644 main_support_agent_deepseek.py
 create mode 100644 main_support_agent_glm.py
 create mode 100644 main_support_agent_minimax.py
 create mode 100644 main_support_agent_openai.py

diff --git a/agents/customer_explorer_agent.py b/agents/customer_explorer_agent.py
index f23bbda..e1d5ab3 100644
--- a/agents/customer_explorer_agent.py
+++ b/agents/customer_explorer_agent.py
@@ -10,6 +10,7 @@
 from pydantic import BaseModel
 from datetime import datetime, timedelta
 import json
+import agent_config
 
 dotenv.load_dotenv()
 
@@ -117,16 +118,9 @@ def create_customer_explorer_agent() -> Agent:
     """Create and return the customer explorer agent"""
     return Agent(
         name="CustomerExplorerAgent",
-        # model=OpenAIChat(
-        #     id="gpt-4o-mini",
-        #     api_key=os.getenv("OPENAI_API_KEY"),
-        # ),
-        model=Nebius(
-            id="openai/gpt-oss-120b",
-            api_key=os.getenv("NEBIUS_API_KEY"),
-        ),
+        model=agent_config.get_model(),
         description=CUSTOMER_EXPLORER_SYSTEM_PROMPT,
-        add_history_to_context=False,
+        add_history_to_context=True,
     )
 
 def explore_customer_context(
diff --git a/agents/next_message_agent.py b/agents/next_message_agent.py
index 1fbf194..edaa70d 100644
--- a/agents/next_message_agent.py
+++ b/agents/next_message_agent.py
@@ -8,6 +8,7 @@
 from agno.models.openai import OpenAIChat
 from agno.models.nebius import Nebius
 from pydantic import BaseModel
+import agent_config
 
 dotenv.load_dotenv()
 
@@ -132,16 +133,9 @@ def create_next_message_agent() -> Agent:
     """Create and return the next message agent"""
     return Agent(
         name="NextMessageAgent",
-        # model=OpenAIChat(
-        #     id="gpt-4o-mini",
-        #     api_key=os.getenv("OPENAI_API_KEY"),
-        # ),
-        model=Nebius(
-            id="openai/gpt-oss-120b",
-            api_key=os.getenv("NEBIUS_API_KEY"),
-        ),
+        model=agent_config.get_model(),
         description=NEXT_MESSAGE_SYSTEM_PROMPT,
-        add_history_to_context=False,
+        add_history_to_context=True,
     )
 
 def suggest_next_message(
diff --git a/agents/summary_agent.py b/agents/summary_agent.py
index 4002b96..3e4b46a 100644
--- a/agents/summary_agent.py
+++ b/agents/summary_agent.py
@@ -8,6 +8,7 @@
 from agno.models.openai import OpenAIChat
 from agno.models.nebius import Nebius
 from pydantic import BaseModel
+import agent_config
 
 dotenv.load_dotenv()
 
@@ -47,16 +48,9 @@ def create_summary_agent() -> Agent:
     """Create and return the summary agent"""
     return Agent(
         name="SummaryAgent",
-        # model=OpenAIChat(
-        #     id="gpt-4o-mini",
-        #     api_key=os.getenv("OPENAI_API_KEY"),
-        # ),
-        model=Nebius(
-            id="openai/gpt-oss-120b",
-            api_key=os.getenv("NEBIUS_API_KEY"),
-        ),
+        model=agent_config.get_model(),
         description=SUMMARY_SYSTEM_PROMPT,
-        add_history_to_context=False,  # Each call is independent
+        add_history_to_context=True,
     )
 
 def summarize_conversation(messages: List[Dict[str, Any]]) -> MessageSummary:
diff --git a/main_support_agent_claude.py b/main_support_agent_claude.py
new file mode 100644
index 0000000..61fdb79
--- /dev/null
+++ b/main_support_agent_claude.py
@@ -0,0 +1,230 @@
+"""
+Main Bank Customer Support Agent - Claude model (configured below as claude-opus-4-6)
+
+This is the production code - kept very simple. One agent with tools, Agno handles memory.
+"""
+
+import os
+import json
+from typing import Dict, Any
+import dotenv
+from agno.agent import Agent
+from agno.models.anthropic import Claude
+
+import agent_config
+
+dotenv.load_dotenv()
+
+agent_config.set_model(Claude(id="claude-opus-4-6", api_key=os.getenv("ANTHROPIC_API_KEY")))
+
+# Sub-agents are imported after set_model(); their factories call agent_config.get_model()
+from agents.summary_agent import summarize_conversation
+from agents.next_message_agent import suggest_next_message
+from agents.customer_explorer_agent import (
+    explore_customer_context,
+    analyze_customer_behavior,
+)
+
+import langwatch
+from openinference.instrumentation.agno import AgnoInstrumentor
+
+langwatch.setup(instrumentors=[AgnoInstrumentor()])
+
+SYSTEM_PROMPT = """
+You are a customer support agent for SecureBank, a modern digital banking platform.
+
+Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations:
+
+TOOL USAGE REQUIREMENTS:
+
+1. **explore_customer_account** - ALWAYS use when:
+   - Customer mentions fraud, unauthorized transactions, or security concerns
+   - Customer asks about spending patterns, budgeting, or financial analysis
+   - Customer needs account-specific insights or personalized recommendations
+   - Any urgent business account issues that need immediate investigation
+
+2. **get_message_suggestion** - ALWAYS use when:
+   - Customer has complex, multi-part problems (locked accounts + fees + missing deposits)
+   - You need guidance on complex banking regulations or procedures
+   - Customer issue involves multiple interconnected banking services
+
+3. **escalate_to_human** - ALWAYS use when:
+   - Customer explicitly demands to speak with a manager, supervisor, or human agent
+   - Customer expresses extreme frustration or dissatisfaction
+   - Business customer has urgent issues affecting operations (payroll, employee payments)
+   - Set urgency to "high" for business-critical issues
+
+4. **get_conversation_summary** - Use when:
+   - Customer asks you to summarize the conversation
+   - You need to analyze conversation patterns or sentiment
+
+CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly.
+
+Guidelines:
+- Be helpful, professional, and empathetic
+- Use tools proactively based on the requirements above
+- Provide clear, actionable solutions
+- Always prioritize customer security and privacy
+
+Remember: Tool usage is not optional when the situation matches the requirements above.
+"""
+
+
+def get_conversation_summary(conversation_context: str = "recent messages") -> str:
+    """
+    Return a canned conversation analysis as a JSON string
+
+    Args:
+        conversation_context: Accepted for the tool signature; not read by this stub
+
+    Returns:
+        JSON object with summary, sentiment, key_issues and suggested_actions
+    """
+    # Tag the current LangWatch trace with this tool's label
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_conversation_summary"]})
+
+    # The conversation history is not consulted here; every call serializes
+    # the same fixed analysis.
+    placeholder_analysis = {
+        "summary": "Conversation analysis requested",
+        "sentiment": "neutral",
+        "key_issues": ["general inquiry"],
+        "suggested_actions": ["continue conversation"],
+    }
+    return json.dumps(placeholder_analysis)
+
+
+def get_message_suggestion(customer_query: str, context: str = "") -> str:
+    """
+    Build a templated reply suggestion for a customer query, as a JSON string
+
+    Args:
+        customer_query: The customer's question; echoed into the suggested response
+        context: Additional conversation context; not read by this stub
+
+    Returns:
+        JSON object with suggested_response, confidence, knowledge_sources, alternatives
+    """
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_message_suggestion"]})
+    # No knowledge base is queried: only the echoed query varies between calls
+    echoed_reply = f"I understand your concern about: {customer_query}. Let me help you with that."
+    suggestion = {
+        "suggested_response": echoed_reply,
+        "confidence": "medium",
+        "knowledge_sources": ["general_banking_guide"],
+        "alternatives": ["Ask for more details", "Escalate to specialist"],
+    }
+    return json.dumps(suggestion)
+
+
+def explore_customer_account(customer_id: str, query: str) -> str:
+    """
+    Combine behavior analysis and query-driven rich experiences into one JSON string
+
+    Args:
+        customer_id: Customer identifier (e.g., CUST_001)
+        query: What to explore about the customer
+
+    Returns:
+        JSON object with "customer_behavior" and a "rich_experiences" list
+    """
+    behavior_analysis = analyze_customer_behavior(customer_id)
+    experiences = explore_customer_context(customer_id, query)
+
+    # The trace is tagged only after both lookups above have returned
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_explore_customer_account"]})
+
+    # Copy five attributes of each experience into a plain dict
+    # before serialization
+    serialized_experiences = [
+        {
+            "type": experience.component_type,
+            "title": experience.title,
+            "data": experience.data,
+            "actions": experience.actions,
+            "priority": experience.priority,
+        }
+        for experience in experiences
+    ]
+
+    payload = {
+        "customer_behavior": behavior_analysis,
+        "rich_experiences": serialized_experiences,
+    }
+    return json.dumps(payload)
+
+
+def escalate_to_human(reason: str, urgency: str = "medium") -> str:
+    """
+    Tag the trace as an escalation and return the hand-off details as JSON
+
+    Args:
+        reason: Why the escalation is needed
+        urgency: Priority level (low, medium, high)
+
+    Returns:
+        JSON object with escalated, reason, urgency, estimated_wait and message
+    """
+    langwatch.get_current_trace().update(metadata={"labels": ["escalation"]})
+
+    # Only the exact string "high" gets the shorter estimate
+    wait_estimate = "10-15 minutes"
+    if urgency == "high":
+        wait_estimate = "5-10 minutes"
+    handoff = {"escalated": True, "reason": reason, "urgency": urgency}
+    handoff["estimated_wait"] = wait_estimate
+    handoff["message"] = "I'm connecting you with a specialist who can provide additional assistance."
+    return json.dumps(handoff)
+
+
+# Main support agent. It gets its own Claude instance, built with the same id
+# and API key as the one passed to agent_config.set_model() earlier in this
+# file, and exposes the four tool functions defined above.
+support_agent = Agent(
+    name="BankCustomerSupportAgent",
+    description=SYSTEM_PROMPT,
+    model=Claude(id="claude-opus-4-6", api_key=os.getenv("ANTHROPIC_API_KEY")),
+    tools=[
+        get_conversation_summary,
+        get_message_suggestion,
+        explore_customer_account,
+        escalate_to_human,
+    ],
+    # Earlier runs are added to the context; num_history_runs is set to 100
+    add_history_to_context=True,
+    num_history_runs=100,
+)
+
+
+# Simple interface for testing
+def chat_with_agent(message: str) -> str:
+    """Run one message through support_agent and return the reply's content"""
+    # One run() call per message; the reply's content attribute is returned as-is
+    return support_agent.run(message).content
+
+
+# Demo conversation, run only when the file is executed as a script
+if __name__ == "__main__":
+    print("=== Bank Customer Support Agent ===")
+    opening_line = "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?"
+    print(opening_line)
+
+    # Scripted two-turn fraud conversation. Both turns go through the same
+    # support_agent, in order, so the follow-up is sent after the first
+    # exchange has completed.
+    scripted_turns = (
+        "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud.",
+        "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?",
+    )
+
+    for customer_turn in scripted_turns:
+        print(f"\nCustomer: {customer_turn}")
+
+        # The customer line is printed before the agent is called
+        agent_reply = chat_with_agent(customer_turn)
+        print(f"Agent: {agent_reply}")
+
+    print("\n=== Conversation Complete ===")
diff --git a/main_support_agent_deepseek.py b/main_support_agent_deepseek.py
new file mode 100644
index 0000000..d57e791
--- /dev/null
+++ b/main_support_agent_deepseek.py
@@ -0,0 +1,229 @@
+"""
+Main Bank Customer Support Agent - DeepSeek V3.2 Model
+
+This is the production code - kept very simple. One agent with tools, Agno handles memory.
+"""
+
+import os
+import json
+from typing import Dict, Any
+import dotenv
+from agno.agent import Agent
+from agno.models.nebius import Nebius
+
+import agent_config
+
+dotenv.load_dotenv()
+
+agent_config.set_model(Nebius(id="deepseek-ai/DeepSeek-V3.2", api_key=os.getenv("NEBIUS_API_KEY")))
+
+# Sub-agents are imported after set_model(); their factories call agent_config.get_model()
+from agents.summary_agent import summarize_conversation
+from agents.next_message_agent import suggest_next_message
+from agents.customer_explorer_agent import (
+    explore_customer_context,
+    analyze_customer_behavior,
+)
+
+import langwatch
+from openinference.instrumentation.agno import AgnoInstrumentor
+
+langwatch.setup(instrumentors=[AgnoInstrumentor()])
+
+SYSTEM_PROMPT = """
+You are a customer support agent for SecureBank, a modern digital banking platform.
+
+Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations:
+
+TOOL USAGE REQUIREMENTS:
+
+1. **explore_customer_account** - ALWAYS use when:
+   - Customer mentions fraud, unauthorized transactions, or security concerns
+   - Customer asks about spending patterns, budgeting, or financial analysis
+   - Customer needs account-specific insights or personalized recommendations
+   - Any urgent business account issues that need immediate investigation
+
+2. **get_message_suggestion** - ALWAYS use when:
+   - Customer has complex, multi-part problems (locked accounts + fees + missing deposits)
+   - You need guidance on complex banking regulations or procedures
+   - Customer issue involves multiple interconnected banking services
+
+3. **escalate_to_human** - ALWAYS use when:
+   - Customer explicitly demands to speak with a manager, supervisor, or human agent
+   - Customer expresses extreme frustration or dissatisfaction
+   - Business customer has urgent issues affecting operations (payroll, employee payments)
+   - Set urgency to "high" for business-critical issues
+
+4. **get_conversation_summary** - Use when:
+   - Customer asks you to summarize the conversation
+   - You need to analyze conversation patterns or sentiment
+
+CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly.
+
+Guidelines:
+- Be helpful, professional, and empathetic
+- Use tools proactively based on the requirements above
+- Provide clear, actionable solutions
+- Always prioritize customer security and privacy
+
+Remember: Tool usage is not optional when the situation matches the requirements above.
+"""
+
+
+def get_conversation_summary(conversation_context: str = "recent messages") -> str:
+    """
+    Return a canned conversation analysis as a JSON string
+
+    Args:
+        conversation_context: Accepted for the tool signature; not read by this stub
+
+    Returns:
+        JSON object with summary, sentiment, key_issues and suggested_actions
+    """
+    # Tag the current LangWatch trace with this tool's label
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_conversation_summary"]})
+
+    # The conversation history is not consulted here; every call serializes
+    # the same fixed analysis.
+    placeholder_analysis = {
+        "summary": "Conversation analysis requested",
+        "sentiment": "neutral",
+        "key_issues": ["general inquiry"],
+        "suggested_actions": ["continue conversation"],
+    }
+    return json.dumps(placeholder_analysis)
+
+
+def get_message_suggestion(customer_query: str, context: str = "") -> str:
+    """
+    Build a templated reply suggestion for a customer query, as a JSON string
+
+    Args:
+        customer_query: The customer's question; echoed into the suggested response
+        context: Additional conversation context; not read by this stub
+
+    Returns:
+        JSON object with suggested_response, confidence, knowledge_sources, alternatives
+    """
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_message_suggestion"]})
+    # No knowledge base is queried: only the echoed query varies between calls
+    echoed_reply = f"I understand your concern about: {customer_query}. Let me help you with that."
+    suggestion = {
+        "suggested_response": echoed_reply,
+        "confidence": "medium",
+        "knowledge_sources": ["general_banking_guide"],
+        "alternatives": ["Ask for more details", "Escalate to specialist"],
+    }
+    return json.dumps(suggestion)
+
+
+def explore_customer_account(customer_id: str, query: str) -> str:
+    """
+    Combine behavior analysis and query-driven rich experiences into one JSON string
+
+    Args:
+        customer_id: Customer identifier (e.g., CUST_001)
+        query: What to explore about the customer
+
+    Returns:
+        JSON object with "customer_behavior" and a "rich_experiences" list
+    """
+    behavior_analysis = analyze_customer_behavior(customer_id)
+    experiences = explore_customer_context(customer_id, query)
+
+    # The trace is tagged only after both lookups above have returned
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_explore_customer_account"]})
+
+    # Copy five attributes of each experience into a plain dict
+    # before serialization
+    serialized_experiences = [
+        {
+            "type": experience.component_type,
+            "title": experience.title,
+            "data": experience.data,
+            "actions": experience.actions,
+            "priority": experience.priority,
+        }
+        for experience in experiences
+    ]
+
+    payload = {
+        "customer_behavior": behavior_analysis,
+        "rich_experiences": serialized_experiences,
+    }
+    return json.dumps(payload)
+
+
+def escalate_to_human(reason: str, urgency: str = "medium") -> str:
+    """
+    Tag the trace as an escalation and return the hand-off details as JSON
+
+    Args:
+        reason: Why the escalation is needed
+        urgency: Priority level (low, medium, high)
+
+    Returns:
+        JSON object with escalated, reason, urgency, estimated_wait and message
+    """
+    langwatch.get_current_trace().update(metadata={"labels": ["escalation"]})
+
+    # Only the exact string "high" gets the shorter estimate
+    wait_estimate = "10-15 minutes"
+    if urgency == "high":
+        wait_estimate = "5-10 minutes"
+    handoff = {"escalated": True, "reason": reason, "urgency": urgency}
+    handoff["estimated_wait"] = wait_estimate
+    handoff["message"] = "I'm connecting you with a specialist who can provide additional assistance."
+    return json.dumps(handoff)
+
+
+# Main support agent. It gets its own Nebius instance, built with the same id
+# and API key as the one passed to agent_config.set_model() earlier in this
+# file, and exposes the four tool functions defined above.
+support_agent = Agent(
+    name="BankCustomerSupportAgent",
+    description=SYSTEM_PROMPT,
+    model=Nebius(id="deepseek-ai/DeepSeek-V3.2", api_key=os.getenv("NEBIUS_API_KEY")),
+    tools=[
+        get_conversation_summary,
+        get_message_suggestion,
+        explore_customer_account,
+        escalate_to_human,
+    ],
+    # Let Agno handle memory by adding earlier runs to the context
+    add_history_to_context=True,
+)
+
+
+# Simple interface for testing
+def chat_with_agent(message: str) -> str:
+    """Run one message through support_agent and return the reply's content"""
+    # One run() call per message; the reply's content attribute is returned as-is
+    return support_agent.run(message).content
+
+
+# Demo conversation, run only when the file is executed as a script
+if __name__ == "__main__":
+    print("=== Bank Customer Support Agent ===")
+    opening_line = "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?"
+    print(opening_line)
+
+    # Scripted two-turn fraud conversation. Both turns go through the same
+    # support_agent, in order, so the follow-up is sent after the first
+    # exchange has completed.
+    scripted_turns = (
+        "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud.",
+        "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?",
+    )
+
+    for customer_turn in scripted_turns:
+        print(f"\nCustomer: {customer_turn}")
+
+        # The customer line is printed before the agent is called
+        agent_reply = chat_with_agent(customer_turn)
+        print(f"Agent: {agent_reply}")
+
+    print("\n=== Conversation Complete ===")
diff --git a/main_support_agent_glm.py b/main_support_agent_glm.py
new file mode 100644
index 0000000..2440441
--- /dev/null
+++ b/main_support_agent_glm.py
@@ -0,0 +1,229 @@
+"""
+Main Bank Customer Support Agent - GLM-4.7-FP8 Model
+
+This is the production code - kept very simple. One agent with tools, Agno handles memory.
+"""
+
+import os
+import json
+from typing import Dict, Any
+import dotenv
+from agno.agent import Agent
+from agno.models.nebius import Nebius
+
+import agent_config
+
+dotenv.load_dotenv()
+
+agent_config.set_model(Nebius(id="zai-org/GLM-4.7-FP8", api_key=os.getenv("NEBIUS_API_KEY")))
+
+# Import our specialized agents as tools
+from agents.summary_agent import summarize_conversation
+from agents.next_message_agent import suggest_next_message
+from agents.customer_explorer_agent import (
+    explore_customer_context,
+    analyze_customer_behavior,
+)
+
+import langwatch
+from openinference.instrumentation.agno import AgnoInstrumentor
+
+langwatch.setup(instrumentors=[AgnoInstrumentor()])
+
+SYSTEM_PROMPT = """
+You are a customer support agent for SecureBank, a modern digital banking platform.
+
+Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations:
+
+TOOL USAGE REQUIREMENTS:
+
+1. **explore_customer_account** - ALWAYS use when:
+   - Customer mentions fraud, unauthorized transactions, or security concerns
+   - Customer asks about spending patterns, budgeting, or financial analysis
+   - Customer needs account-specific insights or personalized recommendations
+   - Any urgent business account issues that need immediate investigation
+
+2. **get_message_suggestion** - ALWAYS use when:
+   - Customer has complex, multi-part problems (locked accounts + fees + missing deposits)
+   - You need guidance on complex banking regulations or procedures
+   - Customer issue involves multiple interconnected banking services
+
+3. **escalate_to_human** - ALWAYS use when:
+   - Customer explicitly demands to speak with a manager, supervisor, or human agent
+   - Customer expresses extreme frustration or dissatisfaction
+   - Business customer has urgent issues affecting operations (payroll, employee payments)
+   - Set urgency to "high" for business-critical issues
+
+4. **get_conversation_summary** - Use when:
+   - Customer asks you to summarize the conversation
+   - You need to analyze conversation patterns or sentiment
+
+CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly.
+
+Guidelines:
+- Be helpful, professional, and empathetic
+- Use tools proactively based on the requirements above
+- Provide clear, actionable solutions
+- Always prioritize customer security and privacy
+
+Remember: Tool usage is not optional when the situation matches the requirements above.
+"""
+
+
+def get_conversation_summary(conversation_context: str = "recent messages") -> str:
+    """
+    Analyze the conversation for patterns, sentiment, and key issues
+
+    Args:
+        conversation_context: Context about what to analyze
+
+    Returns:
+        JSON string with conversation analysis
+    """
+    # Tag the current LangWatch trace with this tool's label.
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_conversation_summary"]})
+
+    # Placeholder payload: no conversation history is read here, and
+    # `conversation_context` is not used.
+    placeholder_analysis = {
+        "summary": "Conversation analysis requested",
+        "sentiment": "neutral",
+        "key_issues": ["general inquiry"],
+        "suggested_actions": ["continue conversation"],
+    }
+    return json.dumps(placeholder_analysis)
+
+
+def get_message_suggestion(customer_query: str, context: str = "") -> str:
+    """
+    Get suggestions for responding to customer queries using knowledge base
+
+    Args:
+        customer_query: The customer's question or concern
+        context: Additional context about the conversation
+
+    Returns:
+        JSON string with response suggestions
+    """
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_message_suggestion"]})
+    # Canned suggestion in place of a knowledge-base lookup; `context` is not used.
+    return json.dumps(
+        {
+            "suggested_response": f"I understand your concern about: {customer_query}. Let me help you with that.",
+            "confidence": "medium",
+            "knowledge_sources": ["general_banking_guide"],
+            "alternatives": ["Ask for more details", "Escalate to specialist"],
+        }
+    )
+
+
+def explore_customer_account(customer_id: str, query: str) -> str:
+    """
+    Explore customer account data and provide rich insights
+
+    Args:
+        customer_id: Customer identifier (e.g., CUST_001)
+        query: What to explore about the customer
+
+    Returns:
+        JSON string with customer insights and rich experiences
+    """
+    # Fetch the behavior analysis and the query-driven experiences first.
+    behavior_report = analyze_customer_behavior(customer_id)
+    experiences = explore_customer_context(customer_id, query)
+
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_explore_customer_account"]})
+
+    # Reduce each experience to the fields exposed in the JSON payload.
+    serialized_experiences = [
+        {
+            "type": experience.component_type,
+            "title": experience.title,
+            "data": experience.data,
+            "actions": experience.actions,
+            "priority": experience.priority,
+        }
+        for experience in experiences
+    ]
+
+    return json.dumps(
+        {
+            "customer_behavior": behavior_report,
+            "rich_experiences": serialized_experiences,
+        }
+    )
+
+
+def escalate_to_human(reason: str, urgency: str = "medium") -> str:
+    """
+    Escalate the conversation to a human agent
+
+    Args:
+        reason: Why the escalation is needed
+        urgency: Priority level (low, medium, high)
+
+    Returns:
+        JSON string with escalation details
+    """
+    langwatch.get_current_trace().update(metadata={"labels": ["escalation"]})
+    wait_estimate = "10-15 minutes" if urgency != "high" else "5-10 minutes"
+    handoff = {
+        "escalated": True,
+        "reason": reason,
+        "urgency": urgency,
+        "estimated_wait": wait_estimate,
+        "message": "I'm connecting you with a specialist who can provide additional assistance.",
+    }
+    return json.dumps(handoff)
+
+
+# Create the main support agent
+support_agent = Agent(
+    name="BankCustomerSupportAgent",
+    description=SYSTEM_PROMPT,
+    # Model accessed through the Nebius wrapper; key read from NEBIUS_API_KEY.
+    model=Nebius(id="zai-org/GLM-4.7-FP8", api_key=os.getenv("NEBIUS_API_KEY")),
+    # Callables the model may invoke as tools.
+    tools=[
+        get_conversation_summary,
+        get_message_suggestion,
+        explore_customer_account,
+        escalate_to_human,
+    ],
+    # Conversation memory is left to Agno.
+    add_history_to_context=True,
+)
+
+
+# Simple interface for testing
+def chat_with_agent(message: str) -> str:
+    """Run one message through `support_agent` and return the run's content."""
+    run_output = support_agent.run(message)
+    return run_output.content
+
+
+# Example usage
+if __name__ == "__main__":
+    # Scripted two-turn demo: a fraud report, then a card-freeze request.
+    # Both turns go through chat_with_agent, i.e. the same module-level agent.
+    demo_turns = (
+        "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud.",
+        "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?",
+    )
+
+    print("=== Bank Customer Support Agent ===")
+    print(
+        "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?"
+    )
+
+    # Echo each customer line, then the agent's reply to it.
+    for customer_turn in demo_turns:
+        print(f"\nCustomer: {customer_turn}")
+
+        agent_reply = chat_with_agent(customer_turn)
+        print(f"Agent: {agent_reply}")
+
+    print("\n=== Conversation Complete ===")
diff --git a/main_support_agent_minimax.py b/main_support_agent_minimax.py
new file mode 100644
index 0000000..55a05fe
--- /dev/null
+++ b/main_support_agent_minimax.py
@@ -0,0 +1,229 @@
+"""
+Main Bank Customer Support Agent - MiniMax-M2.1 Model
+
+This is the production code - kept very simple. One agent with tools, Agno handles memory.
+"""
+
+import os
+import json
+from typing import Dict, Any
+import dotenv
+from agno.agent import Agent
+from agno.models.nebius import Nebius
+
+import agent_config
+
+dotenv.load_dotenv()
+
+agent_config.set_model(Nebius(id="MiniMaxAI/MiniMax-M2.1", api_key=os.getenv("NEBIUS_API_KEY")))
+
+# Import our specialized agents as tools
+from agents.summary_agent import summarize_conversation
+from agents.next_message_agent import suggest_next_message
+from agents.customer_explorer_agent import (
+    explore_customer_context,
+    analyze_customer_behavior,
+)
+
+import langwatch
+from openinference.instrumentation.agno import AgnoInstrumentor
+
+langwatch.setup(instrumentors=[AgnoInstrumentor()])
+
+SYSTEM_PROMPT = """
+You are a customer support agent for SecureBank, a modern digital banking platform.
+
+Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations:
+
+TOOL USAGE REQUIREMENTS:
+
+1. **explore_customer_account** - ALWAYS use when:
+   - Customer mentions fraud, unauthorized transactions, or security concerns
+   - Customer asks about spending patterns, budgeting, or financial analysis
+   - Customer needs account-specific insights or personalized recommendations
+   - Any urgent business account issues that need immediate investigation
+
+2. **get_message_suggestion** - ALWAYS use when:
+   - Customer has complex, multi-part problems (locked accounts + fees + missing deposits)
+   - You need guidance on complex banking regulations or procedures
+   - Customer issue involves multiple interconnected banking services
+
+3. **escalate_to_human** - ALWAYS use when:
+   - Customer explicitly demands to speak with a manager, supervisor, or human agent
+   - Customer expresses extreme frustration or dissatisfaction
+   - Business customer has urgent issues affecting operations (payroll, employee payments)
+   - Set urgency to "high" for business-critical issues
+
+4. **get_conversation_summary** - Use when:
+   - Customer asks you to summarize the conversation
+   - You need to analyze conversation patterns or sentiment
+
+CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly.
+
+Guidelines:
+- Be helpful, professional, and empathetic
+- Use tools proactively based on the requirements above
+- Provide clear, actionable solutions
+- Always prioritize customer security and privacy
+
+Remember: Tool usage is not optional when the situation matches the requirements above.
+"""
+
+
+def get_conversation_summary(conversation_context: str = "recent messages") -> str:
+    """
+    Analyze the conversation for patterns, sentiment, and key issues
+
+    Args:
+        conversation_context: Context about what to analyze
+
+    Returns:
+        JSON string with conversation analysis
+    """
+    # Tag the current LangWatch trace with this tool's label.
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_conversation_summary"]})
+
+    # Placeholder payload: no conversation history is read here, and
+    # `conversation_context` is not used.
+    placeholder_analysis = {
+        "summary": "Conversation analysis requested",
+        "sentiment": "neutral",
+        "key_issues": ["general inquiry"],
+        "suggested_actions": ["continue conversation"],
+    }
+    return json.dumps(placeholder_analysis)
+
+
+def get_message_suggestion(customer_query: str, context: str = "") -> str:
+    """
+    Get suggestions for responding to customer queries using knowledge base
+
+    Args:
+        customer_query: The customer's question or concern
+        context: Additional context about the conversation
+
+    Returns:
+        JSON string with response suggestions
+    """
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_message_suggestion"]})
+    # Canned suggestion in place of a knowledge-base lookup; `context` is not used.
+    return json.dumps(
+        {
+            "suggested_response": f"I understand your concern about: {customer_query}. Let me help you with that.",
+            "confidence": "medium",
+            "knowledge_sources": ["general_banking_guide"],
+            "alternatives": ["Ask for more details", "Escalate to specialist"],
+        }
+    )
+
+
+def explore_customer_account(customer_id: str, query: str) -> str:
+    """
+    Explore customer account data and provide rich insights
+
+    Args:
+        customer_id: Customer identifier (e.g., CUST_001)
+        query: What to explore about the customer
+
+    Returns:
+        JSON string with customer insights and rich experiences
+    """
+    # Fetch the behavior analysis and the query-driven experiences first.
+    behavior_report = analyze_customer_behavior(customer_id)
+    experiences = explore_customer_context(customer_id, query)
+
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_explore_customer_account"]})
+
+    # Reduce each experience to the fields exposed in the JSON payload.
+    serialized_experiences = [
+        {
+            "type": experience.component_type,
+            "title": experience.title,
+            "data": experience.data,
+            "actions": experience.actions,
+            "priority": experience.priority,
+        }
+        for experience in experiences
+    ]
+
+    return json.dumps(
+        {
+            "customer_behavior": behavior_report,
+            "rich_experiences": serialized_experiences,
+        }
+    )
+
+
+def escalate_to_human(reason: str, urgency: str = "medium") -> str:
+    """
+    Escalate the conversation to a human agent
+
+    Args:
+        reason: Why the escalation is needed
+        urgency: Priority level (low, medium, high)
+
+    Returns:
+        JSON string with escalation details
+    """
+    langwatch.get_current_trace().update(metadata={"labels": ["escalation"]})
+    wait_estimate = "10-15 minutes" if urgency != "high" else "5-10 minutes"
+    handoff = {
+        "escalated": True,
+        "reason": reason,
+        "urgency": urgency,
+        "estimated_wait": wait_estimate,
+        "message": "I'm connecting you with a specialist who can provide additional assistance.",
+    }
+    return json.dumps(handoff)
+
+
+# Create the main support agent
+support_agent = Agent(
+    name="BankCustomerSupportAgent",
+    description=SYSTEM_PROMPT,
+    # Model accessed through the Nebius wrapper; key read from NEBIUS_API_KEY.
+    model=Nebius(id="MiniMaxAI/MiniMax-M2.1", api_key=os.getenv("NEBIUS_API_KEY")),
+    # Callables the model may invoke as tools.
+    tools=[
+        get_conversation_summary,
+        get_message_suggestion,
+        explore_customer_account,
+        escalate_to_human,
+    ],
+    # Conversation memory is left to Agno.
+    add_history_to_context=True,
+)
+
+
+# Simple interface for testing
+def chat_with_agent(message: str) -> str:
+    """Run one message through `support_agent` and return the run's content."""
+    run_output = support_agent.run(message)
+    return run_output.content
+
+
+# Example usage
+if __name__ == "__main__":
+    # Scripted two-turn demo: a fraud report, then a card-freeze request.
+    # Both turns go through chat_with_agent, i.e. the same module-level agent.
+    demo_turns = (
+        "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud.",
+        "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?",
+    )
+
+    print("=== Bank Customer Support Agent ===")
+    print(
+        "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?"
+    )
+
+    # Echo each customer line, then the agent's reply to it.
+    for customer_turn in demo_turns:
+        print(f"\nCustomer: {customer_turn}")
+
+        agent_reply = chat_with_agent(customer_turn)
+        print(f"Agent: {agent_reply}")
+
+    print("\n=== Conversation Complete ===")
diff --git a/main_support_agent_openai.py b/main_support_agent_openai.py
new file mode 100644
index 0000000..259167c
--- /dev/null
+++ b/main_support_agent_openai.py
@@ -0,0 +1,229 @@
+"""
+Main Bank Customer Support Agent - OpenAI GPT-OSS-120B Model
+
+This is the production code - kept very simple. One agent with tools, Agno handles memory.
+"""
+
+import os
+import json
+from typing import Dict, Any
+import dotenv
+from agno.agent import Agent
+from agno.models.nebius import Nebius
+
+import agent_config
+
+dotenv.load_dotenv()
+
+agent_config.set_model(Nebius(id="openai/gpt-oss-120b", api_key=os.getenv("NEBIUS_API_KEY")))
+
+# Import our specialized agents as tools
+from agents.summary_agent import summarize_conversation
+from agents.next_message_agent import suggest_next_message
+from agents.customer_explorer_agent import (
+    explore_customer_context,
+    analyze_customer_behavior,
+)
+
+import langwatch
+from openinference.instrumentation.agno import AgnoInstrumentor
+
+langwatch.setup(instrumentors=[AgnoInstrumentor()])
+
+SYSTEM_PROMPT = """
+You are a customer support agent for SecureBank, a modern digital banking platform.
+
+Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations:
+
+TOOL USAGE REQUIREMENTS:
+
+1. **explore_customer_account** - ALWAYS use when:
+   - Customer mentions fraud, unauthorized transactions, or security concerns
+   - Customer asks about spending patterns, budgeting, or financial analysis
+   - Customer needs account-specific insights or personalized recommendations
+   - Any urgent business account issues that need immediate investigation
+
+2. **get_message_suggestion** - ALWAYS use when:
+   - Customer has complex, multi-part problems (locked accounts + fees + missing deposits)
+   - You need guidance on complex banking regulations or procedures
+   - Customer issue involves multiple interconnected banking services
+
+3. **escalate_to_human** - ALWAYS use when:
+   - Customer explicitly demands to speak with a manager, supervisor, or human agent
+   - Customer expresses extreme frustration or dissatisfaction
+   - Business customer has urgent issues affecting operations (payroll, employee payments)
+   - Set urgency to "high" for business-critical issues
+
+4. **get_conversation_summary** - Use when:
+   - Customer asks you to summarize the conversation
+   - You need to analyze conversation patterns or sentiment
+
+CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly.
+
+Guidelines:
+- Be helpful, professional, and empathetic
+- Use tools proactively based on the requirements above
+- Provide clear, actionable solutions
+- Always prioritize customer security and privacy
+
+Remember: Tool usage is not optional when the situation matches the requirements above.
+"""
+
+
+def get_conversation_summary(conversation_context: str = "recent messages") -> str:
+    """
+    Analyze the conversation for patterns, sentiment, and key issues
+
+    Args:
+        conversation_context: Context about what to analyze
+
+    Returns:
+        JSON string with conversation analysis
+    """
+    # Tag the current LangWatch trace with this tool's label.
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_conversation_summary"]})
+
+    # Placeholder payload: no conversation history is read here, and
+    # `conversation_context` is not used.
+    placeholder_analysis = {
+        "summary": "Conversation analysis requested",
+        "sentiment": "neutral",
+        "key_issues": ["general inquiry"],
+        "suggested_actions": ["continue conversation"],
+    }
+    return json.dumps(placeholder_analysis)
+
+
+def get_message_suggestion(customer_query: str, context: str = "") -> str:
+    """
+    Get suggestions for responding to customer queries using knowledge base
+
+    Args:
+        customer_query: The customer's question or concern
+        context: Additional context about the conversation
+
+    Returns:
+        JSON string with response suggestions
+    """
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_get_message_suggestion"]})
+    # Canned suggestion in place of a knowledge-base lookup; `context` is not used.
+    return json.dumps(
+        {
+            "suggested_response": f"I understand your concern about: {customer_query}. Let me help you with that.",
+            "confidence": "medium",
+            "knowledge_sources": ["general_banking_guide"],
+            "alternatives": ["Ask for more details", "Escalate to specialist"],
+        }
+    )
+
+
+def explore_customer_account(customer_id: str, query: str) -> str:
+    """
+    Explore customer account data and provide rich insights
+
+    Args:
+        customer_id: Customer identifier (e.g., CUST_001)
+        query: What to explore about the customer
+
+    Returns:
+        JSON string with customer insights and rich experiences
+    """
+    # Fetch the behavior analysis and the query-driven experiences first.
+    behavior_report = analyze_customer_behavior(customer_id)
+    experiences = explore_customer_context(customer_id, query)
+
+    trace = langwatch.get_current_trace()
+    trace.update(metadata={"labels": ["tool_explore_customer_account"]})
+
+    # Reduce each experience to the fields exposed in the JSON payload.
+    serialized_experiences = [
+        {
+            "type": experience.component_type,
+            "title": experience.title,
+            "data": experience.data,
+            "actions": experience.actions,
+            "priority": experience.priority,
+        }
+        for experience in experiences
+    ]
+
+    return json.dumps(
+        {
+            "customer_behavior": behavior_report,
+            "rich_experiences": serialized_experiences,
+        }
+    )
+
+
+def escalate_to_human(reason: str, urgency: str = "medium") -> str:
+    """
+    Escalate the conversation to a human agent
+
+    Args:
+        reason: Why the escalation is needed
+        urgency: Priority level (low, medium, high)
+
+    Returns:
+        JSON string with escalation details
+    """
+    langwatch.get_current_trace().update(metadata={"labels": ["escalation"]})
+    wait_estimate = "10-15 minutes" if urgency != "high" else "5-10 minutes"
+    handoff = {
+        "escalated": True,
+        "reason": reason,
+        "urgency": urgency,
+        "estimated_wait": wait_estimate,
+        "message": "I'm connecting you with a specialist who can provide additional assistance.",
+    }
+    return json.dumps(handoff)
+
+
+# Create the main support agent
+support_agent = Agent(
+    name="BankCustomerSupportAgent",
+    description=SYSTEM_PROMPT,
+    # Model accessed through the Nebius wrapper; key read from NEBIUS_API_KEY.
+    model=Nebius(id="openai/gpt-oss-120b", api_key=os.getenv("NEBIUS_API_KEY")),
+    # Callables the model may invoke as tools.
+    tools=[
+        get_conversation_summary,
+        get_message_suggestion,
+        explore_customer_account,
+        escalate_to_human,
+    ],
+    # Conversation memory is left to Agno.
+    add_history_to_context=True,
+)
+
+
+# Simple interface for testing
+def chat_with_agent(message: str) -> str:
+    """Run one message through `support_agent` and return the run's content."""
+    run_output = support_agent.run(message)
+    return run_output.content
+
+
+# Example usage
+if __name__ == "__main__":
+    # Scripted two-turn demo: a fraud report, then a card-freeze request.
+    # Both turns go through chat_with_agent, i.e. the same module-level agent.
+    demo_turns = (
+        "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud.",
+        "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?",
+    )
+
+    print("=== Bank Customer Support Agent ===")
+    print(
+        "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?"
+    )
+
+    # Echo each customer line, then the agent's reply to it.
+    for customer_turn in demo_turns:
+        print(f"\nCustomer: {customer_turn}")
+
+        agent_reply = chat_with_agent(customer_turn)
+        print(f"Agent: {agent_reply}")
+
+    print("\n=== Conversation Complete ===")
diff --git a/tests-demo/test_demo_claude.py b/tests-demo/test_demo_claude.py
index 61ab54a..d21b33c 100644
--- a/tests-demo/test_demo_claude.py
+++ b/tests-demo/test_demo_claude.py
@@ -15,45 +15,81 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 import scenario
-from main_support_agent import support_agent
+from main_support_agent_claude import support_agent
 
 dotenv.load_dotenv()
-scenario.configure(default_model="claude-sonnet-4.5")
+scenario.configure(default_model="openai/gpt-4o")
 
 
-class BankSupportAgentAdapter(scenario.AgentAdapter):
-    """Adapter for our main bank support agent"""
+def _parse_tool_arguments(tool_call: dict) -> dict:
+    """Return the tool call's arguments as a dict; `{}` if absent or malformed."""
+    raw_args = tool_call["function"].get("arguments", {})
+    if isinstance(raw_args, str):
+        try:
+            raw_args = json.loads(raw_args)
+        except json.JSONDecodeError:
+            return {}
+    # Valid JSON need not be an object ("null", "[]"); callers rely on .get().
+    return raw_args if isinstance(raw_args, dict) else {}
+
+
+def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None:
+    assert result.success, "{} failed: {}".format(test_name, result.reasoning or "No failure reasoning returned")
 
-    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
-        message_content = input.last_new_user_message_str()
-        response = support_agent.run(message_content)
 
-        # Convert Agno messages to OpenAI format for Scenario
-        openai_messages = []
-        for message in response.messages or []:
-            if message.role in ["assistant", "user", "system", "tool"]:
-                msg_dict = {"role": message.role, "content": message.content}
+def _build_tool_trace_messages(response) -> list[dict]:
+    """Rebuild assistant/tool message pairs from `response.tools`, then append the text reply."""
+    trace: list[dict] = []
+    for index, execution in enumerate(response.tools or []):
+        call_id = execution.tool_call_id or f"tool_call_{index}"
+        name = execution.tool_name or "unknown_tool"
+        arguments = execution.tool_args if isinstance(execution.tool_args, dict) else {}
 
-                # Add tool calls if present (for assistant messages)
-                if message.tool_calls:
-                    msg_dict["tool_calls"] = message.tool_calls
+        output = execution.result if isinstance(execution.result, str) else json.dumps(execution.result or {})
+        function_call = {
+            "id": call_id,
+            "type": "function",
+            "function": {
+                "name": name,
+                "arguments": json.dumps(arguments),
+            },
+        }
+        trace.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [function_call],
+            }
+        )
+        trace.append(
+            {
+                "role": "tool",
+                "tool_call_id": call_id,
+                "content": output,
+            }
+        )
 
-                # Add tool call ID if present (for tool messages)
-                if hasattr(message, "tool_call_id") and message.tool_call_id:
-                    msg_dict["tool_call_id"] = message.tool_call_id
+    if isinstance(response.content, str) and response.content:
+        trace.append({"role": "assistant", "content": response.content})
 
-                openai_messages.append(msg_dict)
+    return trace
 
-        # Return all messages except system and user (Scenario manages the conversation flow)
-        # We need to include tool messages to satisfy OpenAI's requirements
-        relevant_messages = [
-            msg for msg in openai_messages if msg["role"] in ["assistant", "tool"]
-        ]
 
-        if relevant_messages:
-            return relevant_messages
+class BankSupportAgentAdapter(scenario.AgentAdapter):
+    """Scenario adapter that forwards each user turn to `support_agent`."""
+
+    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
+        # Only the newest user message is passed to the agent run.
+        response = support_agent.run(input.last_new_user_message_str())
+
+        # Prefer the rebuilt trace: every tool call is paired with its result
+        # under one id, which avoids the missing tool_call_id issues seen
+        # with Claude's toolu_* IDs.
+        trace_messages = _build_tool_trace_messages(response)
+        if trace_messages:
+            return trace_messages
+
-        # Fallback to content if no relevant messages found
+        # Nothing could be rebuilt: hand back the raw content instead.
         return response.content  # type: ignore
 
 
@@ -69,15 +105,16 @@ async def test_fraud_investigation_workflow():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
                     "Agent gathers necessary information (account details) to investigate",
                     "Agent offers concrete security actions like card freezing or blocking",
                     "Agent provides clear next steps for fraud investigation and dispute process",
                     "Agent maintains professional and reassuring tone throughout",
+                    "Agent does not re-ask for customer ID that was already provided",
                 ]
             ),
         ],
@@ -98,7 +135,7 @@ async def test_fraud_investigation_workflow():
         ],
     )
 
-    assert result.success, f"Fraud investigation test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Fraud investigation test")
 
 
 @pytest.mark.agent_test
@@ -112,7 +149,7 @@ def check_escalation_called(state: scenario.ScenarioState):
 
         tool_call = state.last_tool_call("escalate_to_human")
         if tool_call:
-            args = json.loads(tool_call["function"]["arguments"])
+            args = _parse_tool_arguments(tool_call)
             reason = args.get("reason", "").lower()
             assert any(
                 keyword in reason
@@ -128,9 +165,9 @@ def check_escalation_called(state: scenario.ScenarioState):
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -153,7 +190,7 @@ def check_escalation_called(state: scenario.ScenarioState):
         ],
     )
 
-    assert result.success, f"Escalation test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Escalation test")
 
 
 @pytest.mark.agent_test
@@ -168,9 +205,9 @@ async def test_complex_issue_triggers_knowledge_base():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
                     "Agent provides systematic approach with clear steps for each issue",
@@ -193,9 +230,7 @@ async def test_complex_issue_triggers_knowledge_base():
         ],
     )
 
-    assert (
-        result.success
-    ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
+    _assert_success(result, "Complex issue test")
 
 
 @pytest.mark.agent_test
@@ -210,9 +245,9 @@ async def test_urgent_business_scenario():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent immediately recognizes the business urgency and employee impact",
                     "Agent responds with high priority and urgency in tone",
@@ -235,7 +270,157 @@ async def test_urgent_business_scenario():
         ],
     )
 
-    assert result.success, f"Urgent business test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Urgent business test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_simple_inquiry_no_tools():  # single-turn scenario: a general support-hours question
+    result = await scenario.run(
+        name="simple inquiry without tool usage - Claude",
+        description="""
+            Customer asks a simple question about branch hours or general banking info.
+            The agent should answer directly without invoking any tools.
+        """,
+        agents=[  # support agent adapter, simulated user, and judge
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[  # NOTE(review): no criterion or script step checks that no tool was invoked, despite the test name
+                    "Agent answers the simple question directly and helpfully",
+                    "Agent does not over-complicate the response",
+                    "Agent maintains a friendly and professional tone",
+                ]
+            ),
+        ],
+        script=[  # scripted steps: one user message, one agent step, then the judge step
+            scenario.user(
+                "What are your customer support hours? I just want to know when I can call if I have an issue."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Simple inquiry test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_spending_analysis_request():  # two-turn scenario: spending breakdown, then where to cut back
+    result = await scenario.run(
+        name="spending analysis and budgeting help - Claude",
+        description="""
+            Customer wants to understand their spending patterns and get budgeting advice.
+            The agent should use explore_customer_account to analyze their transactions.
+        """,
+        agents=[  # support agent adapter, simulated user, and judge
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[  # NOTE(review): tool usage is only a judge criterion here; no script step inspects tool calls
+                    "Agent uses account exploration tools to analyze spending",
+                    "Agent provides specific insights about spending categories",
+                    "Agent offers actionable budgeting advice or recommendations",
+                    "Agent is helpful and non-judgmental about spending habits",
+                ]
+            ),
+        ],
+        script=[  # scripted steps: two user message / agent step pairs, then the judge step
+            scenario.user(
+                "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "That's really helpful. Are there any areas where you think I could cut back?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Spending analysis test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_lost_card_replacement():  # two-turn scenario: lost debit card report, then a request to check activity and reissue
+    result = await scenario.run(
+        name="lost card replacement workflow - Claude",
+        description="""
+            Customer has lost their debit card and needs a replacement.
+            Tests whether the agent handles the card replacement process properly
+            including immediate security measures.
+        """,
+        agents=[  # support agent adapter, simulated user, and judge
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[  # all expectations are judge criteria; the script has no programmatic check step
+                    "Agent treats lost card with appropriate urgency",
+                    "Agent suggests freezing or blocking the lost card immediately",
+                    "Agent explains the replacement card process and timeline",
+                    "Agent asks about any unauthorized transactions since the card was lost",
+                    "Agent reassures the customer about account security",
+                ]
+            ),
+        ],
+        script=[  # scripted steps: two user message / agent step pairs, then the judge step
+            scenario.user(
+                "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Lost card replacement test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_overdraft_fee_dispute():  # two-turn scenario: overdraft fee complaint, then an explicit waiver request
+    result = await scenario.run(
+        name="overdraft fee dispute and resolution - Claude",
+        description="""
+            Customer with a basic checking account notices an overdraft fee and wants
+            it reversed. Tests empathy, account investigation, and fee resolution.
+        """,
+        agents=[  # support agent adapter, simulated user, and judge
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[  # all expectations are judge criteria; the script has no programmatic check step
+                    "Agent shows empathy for the customer's frustration about the fee",
+                    "Agent investigates the account to understand the overdraft situation",
+                    "Agent explains how the overdraft fee occurred",
+                    "Agent offers a resolution path (fee waiver, escalation, or explanation)",
+                    "Agent suggests ways to avoid future overdraft fees",
+                ]
+            ),
+        ],
+        script=[  # scripted steps: two user message / agent step pairs, then the judge step
+            scenario.user(  # NOTE(review): presumably CUST_002 is the basic-checking customer from the description — confirm
+                "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "This isn't fair. I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Overdraft fee dispute test")
 
 
 if __name__ == "__main__":
diff --git a/tests-demo/test_demo_deepseek.py b/tests-demo/test_demo_deepseek.py
index 6f7572d..6133a40 100644
--- a/tests-demo/test_demo_deepseek.py
+++ b/tests-demo/test_demo_deepseek.py
@@ -15,10 +15,64 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 import scenario
-from main_support_agent import support_agent
+from main_support_agent_deepseek import support_agent
 
 dotenv.load_dotenv()
-scenario.configure(default_model="nebius/deepseek-ai/DeepSeek-V3.2")
+scenario.configure(default_model="openai/gpt-4o")
+
+
+def _parse_tool_arguments(tool_call: dict) -> dict:
+    """Return the call's function arguments as a dict; {} if absent, malformed, or not a JSON object."""
+    raw_args = tool_call["function"].get("arguments", {})
+    if isinstance(raw_args, str):
+        try:
+            raw_args = json.loads(raw_args)
+        except json.JSONDecodeError:
+            return {}
+    # Valid JSON such as "null" or "[]" is not a mapping, but callers use .get().
+    return raw_args if isinstance(raw_args, dict) else {}
+
+
+def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None:  # shared final assertion for the scenario tests in this file
+    assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}"  # fixed fallback text when reasoning is falsy
+
+
+def _build_tool_trace_messages(response) -> list[dict]:
+    """Rebuild chat-completion-style messages from ``response.tools`` and ``response.content``.
+
+    Each tool execution yields an assistant ``tool_calls`` message followed by
+    the matching ``tool`` message; non-empty string content is appended last as
+    a plain assistant message. Returns ``[]`` when there is nothing to report.
+    """
+    messages: list[dict] = []
+    for i, tool in enumerate(response.tools or []):
+        # Synthesize an id when missing so the call/result pair stays linked.
+        tool_call_id = tool.tool_call_id or f"tool_call_{i}"
+        tool_args = tool.tool_args if isinstance(tool.tool_args, dict) else {}
+        function = {
+            "name": tool.tool_name or "unknown_tool",
+            "arguments": json.dumps(tool_args),
+        }
+        if isinstance(tool.result, str):
+            tool_result = tool.result
+        else:
+            # Only a missing result becomes "{}"; falsy values such as 0, False
+            # or [] are real results and must be serialized as themselves.
+            tool_result = "{}" if tool.result is None else json.dumps(tool.result)
+
+        messages.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [{"id": tool_call_id, "type": "function", "function": function}],
+            }
+        )
+        messages.append({"role": "tool", "tool_call_id": tool_call_id, "content": tool_result})
+
+    if isinstance(response.content, str) and response.content:
+        messages.append({"role": "assistant", "content": response.content})
+
+    return messages
 
 
 class BankSupportAgentAdapter(scenario.AgentAdapter):
@@ -51,7 +105,15 @@ async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
         ]
 
         if relevant_messages:
-            return relevant_messages
+            has_tool_calls = any(
+                msg["role"] == "assistant" and "tool_calls" in msg for msg in relevant_messages
+            )
+            if has_tool_calls:
+                return relevant_messages
+
+        synthetic_messages = _build_tool_trace_messages(response)
+        if synthetic_messages:
+            return synthetic_messages
 
         # Fallback to content if no relevant messages found
         return response.content  # type: ignore
@@ -68,15 +130,16 @@ async def test_fraud_investigation_workflow():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
                     "Agent gathers necessary information (account details) to investigate",
                     "Agent offers concrete security actions like card freezing or blocking",
                     "Agent provides clear next steps for fraud investigation and dispute process",
                     "Agent maintains professional and reassuring tone throughout",
+                    "Agent does not re-ask for customer ID that was already provided",
                 ]
             ),
         ],
@@ -97,7 +160,7 @@ async def test_fraud_investigation_workflow():
         ],
     )
 
-    assert result.success, f"Fraud investigation test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Fraud investigation test")
 
 @pytest.mark.agent_test
 @pytest.mark.asyncio
@@ -110,7 +173,7 @@ def check_escalation_called(state: scenario.ScenarioState):
 
         tool_call = state.last_tool_call("escalate_to_human")
         if tool_call:
-            args = json.loads(tool_call["function"]["arguments"])
+            args = _parse_tool_arguments(tool_call)
             reason = args.get("reason", "").lower()
             assert any(
                 keyword in reason
@@ -126,9 +189,9 @@ def check_escalation_called(state: scenario.ScenarioState):
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -151,7 +214,7 @@ def check_escalation_called(state: scenario.ScenarioState):
         ],
     )
 
-    assert result.success, f"Escalation test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Escalation test")
 
 
 @pytest.mark.agent_test
@@ -166,9 +229,9 @@ async def test_complex_issue_triggers_knowledge_base():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
                     "Agent provides systematic approach with clear steps for each issue",
@@ -191,9 +254,7 @@ async def test_complex_issue_triggers_knowledge_base():
         ],
     )
 
-    assert (
-        result.success
-    ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
+    _assert_success(result, "Complex issue test")
 
 
 @pytest.mark.agent_test
@@ -208,9 +269,9 @@ async def test_urgent_business_scenario():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent immediately recognizes the business urgency and employee impact",
                     "Agent responds with high priority and urgency in tone",
@@ -233,7 +294,157 @@ async def test_urgent_business_scenario():
         ],
     )
 
-    assert result.success, f"Urgent business test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Urgent business test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_simple_inquiry_no_tools():  # single-turn scenario: a general support-hours question
+    result = await scenario.run(
+        name="simple inquiry without tool usage - DeepSeek",
+        description="""
+            Customer asks a simple question about branch hours or general banking info.
+            The agent should answer directly without invoking any tools.
+        """,
+        agents=[  # support agent adapter, simulated user, and judge
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[  # NOTE(review): no criterion or script step checks that no tool was invoked, despite the test name
+                    "Agent answers the simple question directly and helpfully",
+                    "Agent does not over-complicate the response",
+                    "Agent maintains a friendly and professional tone",
+                ]
+            ),
+        ],
+        script=[  # scripted steps: one user message, one agent step, then the judge step
+            scenario.user(
+                "What are your customer support hours? I just want to know when I can call if I have an issue."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Simple inquiry test")  # failure message carries result.reasoning
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_spending_analysis_request():  # two-turn scenario: spending breakdown, then where to cut back
+    result = await scenario.run(
+        name="spending analysis and budgeting help - DeepSeek",
+        description="""
+            Customer wants to understand their spending patterns and get budgeting advice.
+            The agent should use explore_customer_account to analyze their transactions.
+        """,
+        agents=[  # support agent adapter, simulated user, and judge
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[  # NOTE(review): tool usage is only a judge criterion here; no script step inspects tool calls
+                    "Agent uses account exploration tools to analyze spending",
+                    "Agent provides specific insights about spending categories",
+                    "Agent offers actionable budgeting advice or recommendations",
+                    "Agent is helpful and non-judgmental about spending habits",
+                ]
+            ),
+        ],
+        script=[  # scripted steps: two user message / agent step pairs, then the judge step
+            scenario.user(
+                "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "That's really helpful. Are there any areas where you think I could cut back?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Spending analysis test")  # failure message carries result.reasoning
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_lost_card_replacement():  # two-turn scenario: lost debit card report, then a request to check activity and reissue
+    result = await scenario.run(
+        name="lost card replacement workflow - DeepSeek",
+        description="""
+            Customer has lost their debit card and needs a replacement.
+            Tests whether the agent handles the card replacement process properly
+            including immediate security measures.
+        """,
+        agents=[  # support agent adapter, simulated user, and judge
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[  # all expectations are judge criteria; the script has no programmatic check step
+                    "Agent treats lost card with appropriate urgency",
+                    "Agent suggests freezing or blocking the lost card immediately",
+                    "Agent explains the replacement card process and timeline",
+                    "Agent asks about any unauthorized transactions since the card was lost",
+                    "Agent reassures the customer about account security",
+                ]
+            ),
+        ],
+        script=[  # scripted steps: two user message / agent step pairs, then the judge step
+            scenario.user(
+                "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Lost card replacement test")  # failure message carries result.reasoning
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_overdraft_fee_dispute():  # two-turn scenario: overdraft fee complaint, then an explicit waiver request
+    result = await scenario.run(
+        name="overdraft fee dispute and resolution - DeepSeek",
+        description="""
+            Customer with a basic checking account notices an overdraft fee and wants
+            it reversed. Tests empathy, account investigation, and fee resolution.
+        """,
+        agents=[  # support agent adapter, simulated user, and judge
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[  # all expectations are judge criteria; the script has no programmatic check step
+                    "Agent shows empathy for the customer's frustration about the fee",
+                    "Agent investigates the account to understand the overdraft situation",
+                    "Agent explains how the overdraft fee occurred",
+                    "Agent offers a resolution path (fee waiver, escalation, or explanation)",
+                    "Agent suggests ways to avoid future overdraft fees",
+                ]
+            ),
+        ],
+        script=[  # scripted steps: two user message / agent step pairs, then the judge step
+            scenario.user(  # NOTE(review): presumably CUST_002 is the basic-checking customer from the description — confirm
+                "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "This isn't fair. I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Overdraft fee dispute test")  # failure message carries result.reasoning
 
 
 if __name__ == "__main__":
diff --git a/tests-demo/test_demo_glm.py b/tests-demo/test_demo_glm.py
index f54f7a3..056e782 100644
--- a/tests-demo/test_demo_glm.py
+++ b/tests-demo/test_demo_glm.py
@@ -15,10 +15,64 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 import scenario
-from main_support_agent import support_agent
+from main_support_agent_glm import support_agent
 
 dotenv.load_dotenv()
-scenario.configure(default_model="nebius/zai-org/GLM-4.7-FP8")
+scenario.configure(default_model="openai/gpt-4o")
+
+
+def _parse_tool_arguments(tool_call: dict) -> dict:
+    """Return the call's function arguments as a dict; {} if absent, malformed, or not a JSON object."""
+    raw_args = tool_call["function"].get("arguments", {})
+    if isinstance(raw_args, str):
+        try:
+            raw_args = json.loads(raw_args)
+        except json.JSONDecodeError:
+            return {}
+    # Valid JSON such as "null" or "[]" is not a mapping, but callers use .get().
+    return raw_args if isinstance(raw_args, dict) else {}
+
+
+def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None:  # shared final assertion for the scenario tests in this file
+    assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}"  # fixed fallback text when reasoning is falsy
+
+
+def _build_tool_trace_messages(response) -> list[dict]:
+    """Rebuild chat-completion-style messages from ``response.tools`` and ``response.content``.
+
+    Each tool execution yields an assistant ``tool_calls`` message followed by
+    the matching ``tool`` message; non-empty string content is appended last as
+    a plain assistant message. Returns ``[]`` when there is nothing to report.
+    """
+    messages: list[dict] = []
+    for i, tool in enumerate(response.tools or []):
+        # Synthesize an id when missing so the call/result pair stays linked.
+        tool_call_id = tool.tool_call_id or f"tool_call_{i}"
+        tool_args = tool.tool_args if isinstance(tool.tool_args, dict) else {}
+        function = {
+            "name": tool.tool_name or "unknown_tool",
+            "arguments": json.dumps(tool_args),
+        }
+        if isinstance(tool.result, str):
+            tool_result = tool.result
+        else:
+            # Only a missing result becomes "{}"; falsy values such as 0, False
+            # or [] are real results and must be serialized as themselves.
+            tool_result = "{}" if tool.result is None else json.dumps(tool.result)
+
+        messages.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [{"id": tool_call_id, "type": "function", "function": function}],
+            }
+        )
+        messages.append({"role": "tool", "tool_call_id": tool_call_id, "content": tool_result})
+
+    if isinstance(response.content, str) and response.content:
+        messages.append({"role": "assistant", "content": response.content})
+
+    return messages
 
 
 class BankSupportAgentAdapter(scenario.AgentAdapter):
@@ -51,7 +105,15 @@ async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
         ]
 
         if relevant_messages:
-            return relevant_messages
+            has_tool_calls = any(
+                msg["role"] == "assistant" and "tool_calls" in msg for msg in relevant_messages
+            )
+            if has_tool_calls:
+                return relevant_messages
+
+        synthetic_messages = _build_tool_trace_messages(response)
+        if synthetic_messages:
+            return synthetic_messages
 
         # Fallback to content if no relevant messages found
         return response.content  # type: ignore
@@ -69,15 +131,16 @@ async def test_fraud_investigation_workflow():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
                     "Agent gathers necessary information (account details) to investigate",
                     "Agent offers concrete security actions like card freezing or blocking",
                     "Agent provides clear next steps for fraud investigation and dispute process",
                     "Agent maintains professional and reassuring tone throughout",
+                    "Agent does not re-ask for customer ID that was already provided",
                 ]
             ),
         ],
@@ -98,7 +161,7 @@ async def test_fraud_investigation_workflow():
         ],
     )
 
-    assert result.success, f"Fraud investigation test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Fraud investigation test")
 
 
 @pytest.mark.agent_test
@@ -112,7 +175,7 @@ def check_escalation_called(state: scenario.ScenarioState):
 
         tool_call = state.last_tool_call("escalate_to_human")
         if tool_call:
-            args = json.loads(tool_call["function"]["arguments"])
+            args = _parse_tool_arguments(tool_call)
             reason = args.get("reason", "").lower()
             assert any(
                 keyword in reason
@@ -128,9 +191,9 @@ def check_escalation_called(state: scenario.ScenarioState):
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -153,7 +216,47 @@ def check_escalation_called(state: scenario.ScenarioState):
         ],
     )
 
-    assert result.success, f"Escalation test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Escalation test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_complex_issue_triggers_knowledge_base():  # two-turn scenario: three simultaneous issues, then added urgency about paying bills
+    result = await scenario.run(
+        name="complex multi-issue banking problem - GLM",
+        description="""
+            Customer has multiple interconnected banking problems: locked online banking,
+            unexpected fees, and missing direct deposit. They need systematic help.
+            This tests whether the agent can handle multiple issues comprehensively.
+        """,
+        agents=[  # support agent adapter, simulated user, and judge
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[  # NOTE(review): despite the test name, no criterion or script step checks a knowledge-base tool call
+                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
+                    "Agent provides systematic approach with clear steps for each issue",
+                    "Agent shows empathy for customer's stress and urgency",
+                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
+                    "Agent offers concrete next steps that the customer can act on",
+                ]
+            ),
+        ],
+        script=[  # scripted steps: two user message / agent step pairs, then the judge step
+            scenario.user(
+                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Complex issue test")  # failure message carries result.reasoning
 
 
 @pytest.mark.agent_test
@@ -168,9 +271,9 @@ async def test_urgent_business_scenario():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent immediately recognizes the business urgency and employee impact",
                     "Agent responds with high priority and urgency in tone",
@@ -193,49 +296,157 @@ async def test_urgent_business_scenario():
         ],
     )
 
-    assert result.success, f"Urgent business test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Urgent business test")
 
 
 @pytest.mark.agent_test
 @pytest.mark.asyncio
-async def test_complex_issue_triggers_knowledge_base():
+async def test_simple_inquiry_no_tools():
     result = await scenario.run(
-        name="complex multi-issue banking problem - GLM",
+        name="simple inquiry without tool usage - GLM",
         description="""
-            Customer has multiple interconnected banking problems: locked online banking,
-            unexpected fees, and missing direct deposit. They need systematic help.
-            This tests whether the agent can handle multiple issues comprehensively.
+            Customer asks a simple question about branch hours or general banking info.
+            The agent should answer directly without invoking any tools.
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
-                    "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
-                    "Agent provides systematic approach with clear steps for each issue",
-                    "Agent shows empathy for customer's stress and urgency",
-                    "Agent prioritizes the most urgent issue (locked account for bill payments)",
-                    "Agent offers concrete next steps that the customer can act on",
+                    "Agent answers the simple question directly and helpfully",
+                    "Agent does not over-complicate the response",
+                    "Agent maintains a friendly and professional tone",
                 ]
             ),
         ],
         script=[
             scenario.user(
-                "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001."
+                "What are your customer support hours? I just want to know when I can call if I have an issue."
             ),
             scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Simple inquiry test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_spending_analysis_request():  # Script: two user turns, each followed by an agent reply, then scenario.judge().
+    result = await scenario.run(
+        name="spending analysis and budgeting help - GLM",
+        description="""
+            Customer wants to understand their spending patterns and get budgeting advice.
+            The agent should use explore_customer_account to analyze their transactions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent uses account exploration tools to analyze spending",
+                    "Agent provides specific insights about spending categories",
+                    "Agent offers actionable budgeting advice or recommendations",
+                    "Agent is helpful and non-judgmental about spending habits",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "That's really helpful. Are there any areas where you think I could cut back?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Spending analysis test")  # fails the test when result.success is falsy
+
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_lost_card_replacement():  # Script: two user turns, each followed by an agent reply, then scenario.judge().
+    result = await scenario.run(
+        name="lost card replacement workflow - GLM",
+        description="""
+            Customer has lost their debit card and needs a replacement.
+            Tests whether the agent handles the card replacement process properly
+            including immediate security measures.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent treats lost card with appropriate urgency",
+                    "Agent suggests freezing or blocking the lost card immediately",
+                    "Agent explains the replacement card process and timeline",
+                    "Agent asks about any unauthorized transactions since the card was lost",
+                    "Agent reassures the customer about account security",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Lost card replacement test")  # fails the test when result.success is falsy
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_overdraft_fee_dispute():
+    result = await scenario.run(
+        name="overdraft fee dispute and resolution - GLM",
+        description="""
+            Customer with a basic checking account notices an overdraft fee and wants
+            it reversed. Tests empathy, account investigation, and fee resolution.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent shows empathy for the customer's frustration about the fee",
+                    "Agent investigates the account to understand the overdraft situation",
+                    "Agent explains how the overdraft fee occurred",
+                    "Agent offers a resolution path (fee waiver, escalation, or explanation)",
+                    "Agent suggests ways to avoid future overdraft fees",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "This isn't fair. I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?"
             ),
             scenario.agent(),
             scenario.judge(),
         ],
     )
 
-    assert (
-        result.success
-    ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
+    _assert_success(result, "Overdraft fee dispute test")
 
 
 if __name__ == "__main__":
diff --git a/tests-demo/test_demo_minimax.py b/tests-demo/test_demo_minimax.py
index 2cd2da7..f2d481e 100644
--- a/tests-demo/test_demo_minimax.py
+++ b/tests-demo/test_demo_minimax.py
@@ -15,10 +15,64 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 import scenario
-from main_support_agent import support_agent
+from main_support_agent_minimax import support_agent
 
 dotenv.load_dotenv()
-scenario.configure(default_model="nebius/MiniMaxAI/MiniMax-M2.1")
+scenario.configure(default_model="openai/gpt-4o")
+
+
+def _parse_tool_arguments(tool_call: dict) -> dict:
+    """Return the call's ``function.arguments`` as a dict; ``{}`` if absent or malformed."""
+    raw_args = tool_call["function"].get("arguments", {})
+    if isinstance(raw_args, str):
+        try:
+            raw_args = json.loads(raw_args)
+        except json.JSONDecodeError:
+            return {}
+    # Valid JSON such as "null" or "[1]" is still not an argument mapping.
+    return raw_args if isinstance(raw_args, dict) else {}
+
+
+def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None:  # Fail with result.reasoning (or a placeholder) when result.success is falsy.
+    assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}"
+
+
+def _build_tool_trace_messages(response) -> list[dict]:
+    """Rebuild an OpenAI-style message trace from the tool executions on ``response``.
+
+    Every entry of ``response.tools`` yields an assistant ``tool_calls`` message
+    followed by the matching ``tool`` result message. A non-empty string
+    ``response.content`` is appended last as a plain assistant message.
+    Returns an empty list when there are no tools and no textual content.
+    """
+    messages: list[dict] = []
+    for i, tool in enumerate(response.tools or []):
+        tool_call_id = tool.tool_call_id or f"tool_call_{i}"
+        function_call = {
+            "name": tool.tool_name or "unknown_tool",
+            # Non-dict argument payloads are reported as an empty object.
+            "arguments": json.dumps(tool.tool_args if isinstance(tool.tool_args, dict) else {}),
+        }
+        tool_result = tool.result
+        if not isinstance(tool_result, str):
+            # Only a missing result becomes "{}"; falsy values such as 0 or [] are kept.
+            tool_result = json.dumps({} if tool_result is None else tool_result)
+
+        messages.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [{"id": tool_call_id, "type": "function", "function": function_call}],
+            }
+        )
+        messages.append(
+            {"role": "tool", "tool_call_id": tool_call_id, "content": tool_result}
+        )
+
+    if isinstance(response.content, str) and response.content:
+        messages.append({"role": "assistant", "content": response.content})
+
+    return messages
 
 
 class BankSupportAgentAdapter(scenario.AgentAdapter):
@@ -51,7 +105,15 @@ async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
         ]
 
         if relevant_messages:
-            return relevant_messages
+            has_tool_calls = any(
+                msg["role"] == "assistant" and "tool_calls" in msg for msg in relevant_messages
+            )
+            if has_tool_calls:
+                return relevant_messages
+
+        synthetic_messages = _build_tool_trace_messages(response)
+        if synthetic_messages:
+            return synthetic_messages
 
         # Fallback to content if no relevant messages found
         return response.content  # type: ignore
@@ -69,15 +131,16 @@ async def test_fraud_investigation_workflow():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
                     "Agent gathers necessary information (account details) to investigate",
                     "Agent offers concrete security actions like card freezing or blocking",
                     "Agent provides clear next steps for fraud investigation and dispute process",
                     "Agent maintains professional and reassuring tone throughout",
+                    "Agent does not re-ask for customer ID that was already provided",
                 ]
             ),
         ],
@@ -98,7 +161,7 @@ async def test_fraud_investigation_workflow():
         ],
     )
 
-    assert result.success, f"Fraud investigation test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Fraud investigation test")
 
 
 @pytest.mark.agent_test
@@ -112,7 +175,7 @@ def check_escalation_called(state: scenario.ScenarioState):
 
         tool_call = state.last_tool_call("escalate_to_human")
         if tool_call:
-            args = json.loads(tool_call["function"]["arguments"])
+            args = _parse_tool_arguments(tool_call)
             reason = args.get("reason", "").lower()
             assert any(
                 keyword in reason
@@ -128,9 +191,9 @@ def check_escalation_called(state: scenario.ScenarioState):
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -153,7 +216,7 @@ def check_escalation_called(state: scenario.ScenarioState):
         ],
     )
 
-    assert result.success, f"Escalation test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Escalation test")
 
 
 @pytest.mark.agent_test
@@ -168,9 +231,9 @@ async def test_complex_issue_triggers_knowledge_base():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
                     "Agent provides systematic approach with clear steps for each issue",
@@ -193,9 +256,7 @@ async def test_complex_issue_triggers_knowledge_base():
         ],
     )
 
-    assert (
-        result.success
-    ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
+    _assert_success(result, "Complex issue test")
 
 
 @pytest.mark.agent_test
@@ -210,9 +271,9 @@ async def test_urgent_business_scenario():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent immediately recognizes the business urgency and employee impact",
                     "Agent responds with high priority and urgency in tone",
@@ -235,7 +296,157 @@ async def test_urgent_business_scenario():
         ],
     )
 
-    assert result.success, f"Urgent business test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Urgent business test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_simple_inquiry_no_tools():  # Script: one user turn, one agent reply, then scenario.judge().
+    result = await scenario.run(
+        name="simple inquiry without tool usage - MiniMax",
+        description="""
+            Customer asks a simple question about branch hours or general banking info.
+            The agent should answer directly without invoking any tools.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent answers the simple question directly and helpfully",
+                    "Agent does not over-complicate the response",
+                    "Agent maintains a friendly and professional tone",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "What are your customer support hours? I just want to know when I can call if I have an issue."
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Simple inquiry test")  # fails the test when result.success is falsy
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_spending_analysis_request():  # Script: two user turns, each followed by an agent reply, then scenario.judge().
+    result = await scenario.run(
+        name="spending analysis and budgeting help - MiniMax",
+        description="""
+            Customer wants to understand their spending patterns and get budgeting advice.
+            The agent should use explore_customer_account to analyze their transactions.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent uses account exploration tools to analyze spending",
+                    "Agent provides specific insights about spending categories",
+                    "Agent offers actionable budgeting advice or recommendations",
+                    "Agent is helpful and non-judgmental about spending habits",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?"
+            ),
+            scenario.agent(),
+            scenario.user(
+                "That's really helpful. Are there any areas where you think I could cut back?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Spending analysis test")  # fails the test when result.success is falsy
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_lost_card_replacement():  # Script: two user turns, each followed by an agent reply, then scenario.judge().
+    result = await scenario.run(
+        name="lost card replacement workflow - MiniMax",
+        description="""
+            Customer has lost their debit card and needs a replacement.
+            Tests whether the agent handles the card replacement process properly
+            including immediate security measures.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent treats lost card with appropriate urgency",
+                    "Agent suggests freezing or blocking the lost card immediately",
+                    "Agent explains the replacement card process and timeline",
+                    "Agent asks about any unauthorized transactions since the card was lost",
+                    "Agent reassures the customer about account security",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Lost card replacement test")  # fails the test when result.success is falsy
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_overdraft_fee_dispute():  # Script: two user turns, each followed by an agent reply, then scenario.judge().
+    result = await scenario.run(
+        name="overdraft fee dispute and resolution - MiniMax",
+        description="""
+            Customer with a basic checking account notices an overdraft fee and wants
+            it reversed. Tests empathy, account investigation, and fee resolution.
+        """,
+        agents=[
+            BankSupportAgentAdapter(),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+            scenario.JudgeAgent(
+                model="openai/gpt-4o",
+                criteria=[
+                    "Agent shows empathy for the customer's frustration about the fee",
+                    "Agent investigates the account to understand the overdraft situation",
+                    "Agent explains how the overdraft fee occurred",
+                    "Agent offers a resolution path (fee waiver, escalation, or explanation)",
+                    "Agent suggests ways to avoid future overdraft fees",
+                ]
+            ),
+        ],
+        script=[
+            scenario.user(
+                "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002."
+            ),
+            scenario.agent(),
+            scenario.user(
+                "This isn't fair. I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?"
+            ),
+            scenario.agent(),
+            scenario.judge(),
+        ],
+    )
+
+    _assert_success(result, "Overdraft fee dispute test")  # fails the test when result.success is falsy
 
 
 if __name__ == "__main__":
diff --git a/tests-demo/test_demo_openai.py b/tests-demo/test_demo_openai.py
index e61f3b5..198175d 100644
--- a/tests-demo/test_demo_openai.py
+++ b/tests-demo/test_demo_openai.py
@@ -2,7 +2,7 @@
 Tests for the main bank customer support agent - OpenAI Model
 
 These tests cover real business scenarios and validate tool calling behavior
-using OpenAI claude-sonnet-4.5-mini model for evaluation.
+using OpenAI claude-sonnet-4-5-mini model for evaluation.
 """
 import asyncio
 import pytest
@@ -15,10 +15,64 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 import scenario
-from main_support_agent import support_agent
+from main_support_agent_openai import support_agent
 
 dotenv.load_dotenv()
-scenario.configure(default_model="openai/gpt-oss-120b")
+scenario.configure(default_model="openai/gpt-4o")
+
+
+def _parse_tool_arguments(tool_call: dict) -> dict:
+    """Return the call's ``function.arguments`` as a dict; ``{}`` if absent or malformed."""
+    raw_args = tool_call["function"].get("arguments", {})
+    if isinstance(raw_args, str):
+        try:
+            raw_args = json.loads(raw_args)
+        except json.JSONDecodeError:
+            return {}
+    # Valid JSON such as "null" or "[1]" is still not an argument mapping.
+    return raw_args if isinstance(raw_args, dict) else {}
+
+
+def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None:  # Fail with result.reasoning (or a placeholder) when result.success is falsy.
+    assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}"
+
+
+def _build_tool_trace_messages(response) -> list[dict]:
+    """Rebuild an OpenAI-style message trace from the tool executions on ``response``.
+
+    Every entry of ``response.tools`` yields an assistant ``tool_calls`` message
+    followed by the matching ``tool`` result message. A non-empty string
+    ``response.content`` is appended last as a plain assistant message.
+    Returns an empty list when there are no tools and no textual content.
+    """
+    messages: list[dict] = []
+    for i, tool in enumerate(response.tools or []):
+        tool_call_id = tool.tool_call_id or f"tool_call_{i}"
+        function_call = {
+            "name": tool.tool_name or "unknown_tool",
+            # Non-dict argument payloads are reported as an empty object.
+            "arguments": json.dumps(tool.tool_args if isinstance(tool.tool_args, dict) else {}),
+        }
+        tool_result = tool.result
+        if not isinstance(tool_result, str):
+            # Only a missing result becomes "{}"; falsy values such as 0 or [] are kept.
+            tool_result = json.dumps({} if tool_result is None else tool_result)
+
+        messages.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [{"id": tool_call_id, "type": "function", "function": function_call}],
+            }
+        )
+        messages.append(
+            {"role": "tool", "tool_call_id": tool_call_id, "content": tool_result}
+        )
+
+    if isinstance(response.content, str) and response.content:
+        messages.append({"role": "assistant", "content": response.content})
+
+    return messages
 
 
 class BankSupportAgentAdapter(scenario.AgentAdapter):
@@ -51,7 +105,15 @@ async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
         ]
 
         if relevant_messages:
-            return relevant_messages
+            has_tool_calls = any(
+                msg["role"] == "assistant" and "tool_calls" in msg for msg in relevant_messages
+            )
+            if has_tool_calls:
+                return relevant_messages
+
+        synthetic_messages = _build_tool_trace_messages(response)
+        if synthetic_messages:
+            return synthetic_messages
 
         # Fallback to content if no relevant messages found
         return response.content  # type: ignore
@@ -69,15 +131,16 @@ async def test_fraud_investigation_workflow():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent takes fraud concerns seriously and responds with urgency",
                     "Agent gathers necessary information (account details) to investigate",
                     "Agent offers concrete security actions like card freezing or blocking",
                     "Agent provides clear next steps for fraud investigation and dispute process",
                     "Agent maintains professional and reassuring tone throughout",
+                    "Agent does not re-ask for customer ID that was already provided",
                 ]
             ),
         ],
@@ -98,7 +161,7 @@ async def test_fraud_investigation_workflow():
         ],
     )
 
-    assert result.success, f"Fraud investigation test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Fraud investigation test")
 
 
 @pytest.mark.agent_test
@@ -112,7 +175,7 @@ def check_escalation_called(state: scenario.ScenarioState):
 
         tool_call = state.last_tool_call("escalate_to_human")
         if tool_call:
-            args = json.loads(tool_call["function"]["arguments"])
+            args = _parse_tool_arguments(tool_call)
             reason = args.get("reason", "").lower()
             assert any(
                 keyword in reason
@@ -128,9 +191,9 @@ def check_escalation_called(state: scenario.ScenarioState):
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent acknowledges customer's frustration empathetically",
                     "Agent offers to escalate when requested",
@@ -153,7 +216,7 @@ def check_escalation_called(state: scenario.ScenarioState):
         ],
     )
 
-    assert result.success, f"Escalation test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Escalation test")
 
 
 @pytest.mark.agent_test
@@ -168,9 +231,9 @@ async def test_complex_issue_triggers_knowledge_base():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)",
                     "Agent provides systematic approach with clear steps for each issue",
@@ -193,9 +256,7 @@ async def test_complex_issue_triggers_knowledge_base():
         ],
     )
 
-    assert (
-        result.success
-    ), f"Complex issue test failed: {result.reasoning if hasattr(result, 'reasoning') else 'No failure reason available'}"
+    _assert_success(result, "Complex issue test")
 
 
 @pytest.mark.agent_test
@@ -210,9 +271,9 @@ async def test_urgent_business_scenario():
         """,
         agents=[
             BankSupportAgentAdapter(),
-            scenario.UserSimulatorAgent(model="claude-sonnet-4.5"),
+            scenario.UserSimulatorAgent(model="openai/gpt-4o"),
             scenario.JudgeAgent(
-                model="claude-sonnet-4.5",
+                model="openai/gpt-4o",
                 criteria=[
                     "Agent immediately recognizes the business urgency and employee impact",
                     "Agent responds with high priority and urgency in tone",
@@ -235,7 +296,157 @@ async def test_urgent_business_scenario():
         ],
     )
 
-    assert result.success, f"Urgent business test failed: {result.failure_reason}"  # type: ignore
+    _assert_success(result, "Urgent business test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_simple_inquiry_no_tools():
+    """Run the support-hours inquiry scenario; `_assert_success` checks the result."""
+    judge_criteria = [
+        "Agent answers the simple question directly and helpfully",
+        "Agent does not over-complicate the response",
+        "Agent maintains a friendly and professional tone",
+    ]
+    participants = [
+        BankSupportAgentAdapter(),
+        scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+        scenario.JudgeAgent(model="openai/gpt-4o", criteria=judge_criteria),
+    ]
+    # Scripted flow: one user question, one agent turn, then the judge step.
+    steps = [
+        scenario.user("What are your customer support hours? I just want to know when I can call if I have an issue."),
+        scenario.agent(),
+        scenario.judge(),
+    ]
+    outcome = await scenario.run(
+        name="simple inquiry without tool usage - OpenAI",
+        description="""
+            Customer asks a simple question about branch hours or general banking info.
+            The agent should answer directly without invoking any tools.
+        """,
+        agents=participants,
+        script=steps,
+    )
+
+    _assert_success(outcome, "Simple inquiry test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_spending_analysis_request():
+    """Run the two-turn spending-analysis scenario for customer CUST_001.
+
+    The pass/fail check on the scenario result is delegated to `_assert_success`.
+    """
+    judge_criteria = [
+        "Agent uses account exploration tools to analyze spending",
+        "Agent provides specific insights about spending categories",
+        "Agent offers actionable budgeting advice or recommendations",
+        "Agent is helpful and non-judgmental about spending habits",
+    ]
+    participants = [
+        BankSupportAgentAdapter(),
+        scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+        scenario.JudgeAgent(model="openai/gpt-4o", criteria=judge_criteria),
+    ]
+    steps = [
+        scenario.user("My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?"),
+        scenario.agent(),
+        scenario.user("That's really helpful. Are there any areas where you think I could cut back?"),
+        scenario.agent(),
+        scenario.judge(),
+    ]
+    outcome = await scenario.run(
+        name="spending analysis and budgeting help - OpenAI",
+        description="""
+            Customer wants to understand their spending patterns and get budgeting advice.
+            The agent should use explore_customer_account to analyze their transactions.
+        """,
+        agents=participants,
+        script=steps,
+    )
+
+    _assert_success(outcome, "Spending analysis test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_lost_card_replacement():
+    """Run the two-turn lost-debit-card scenario for customer CUST_001.
+
+    The pass/fail check on the scenario result is delegated to `_assert_success`.
+    """
+    judge_criteria = [
+        "Agent treats lost card with appropriate urgency",
+        "Agent suggests freezing or blocking the lost card immediately",
+        "Agent explains the replacement card process and timeline",
+        "Agent asks about any unauthorized transactions since the card was lost",
+        "Agent reassures the customer about account security",
+    ]
+    participants = [
+        BankSupportAgentAdapter(),
+        scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+        scenario.JudgeAgent(model="openai/gpt-4o", criteria=judge_criteria),
+    ]
+    steps = [
+        scenario.user("I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001."),
+        scenario.agent(),
+        scenario.user("I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?"),
+        scenario.agent(),
+        scenario.judge(),
+    ]
+    outcome = await scenario.run(
+        name="lost card replacement workflow - OpenAI",
+        description="""
+            Customer has lost their debit card and needs a replacement.
+            Tests whether the agent handles the card replacement process properly
+            including immediate security measures.
+        """,
+        agents=participants,
+        script=steps,
+    )
+
+    _assert_success(outcome, "Lost card replacement test")
+
+
+@pytest.mark.agent_test
+@pytest.mark.asyncio
+async def test_overdraft_fee_dispute():
+    """Run the two-turn overdraft-fee dispute scenario for customer CUST_002.
+
+    The pass/fail check on the scenario result is delegated to `_assert_success`.
+    """
+    judge_criteria = [
+        "Agent shows empathy for the customer's frustration about the fee",
+        "Agent investigates the account to understand the overdraft situation",
+        "Agent explains how the overdraft fee occurred",
+        "Agent offers a resolution path (fee waiver, escalation, or explanation)",
+        "Agent suggests ways to avoid future overdraft fees",
+    ]
+    participants = [
+        BankSupportAgentAdapter(),
+        scenario.UserSimulatorAgent(model="openai/gpt-4o"),
+        scenario.JudgeAgent(model="openai/gpt-4o", criteria=judge_criteria),
+    ]
+    steps = [
+        scenario.user("I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002."),
+        scenario.agent(),
+        scenario.user("This isn't fair. I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?"),
+        scenario.agent(),
+        scenario.judge(),
+    ]
+    outcome = await scenario.run(
+        name="overdraft fee dispute and resolution - OpenAI",
+        description="""
+            Customer with a basic checking account notices an overdraft fee and wants
+            it reversed. Tests empathy, account investigation, and fee resolution.
+        """,
+        agents=participants,
+        script=steps,
+    )
+
+    _assert_success(outcome, "Overdraft fee dispute test")
 
 
 if __name__ == "__main__":