diff --git a/agents/customer_explorer_agent.py b/agents/customer_explorer_agent.py index a1d8adc..e1d5ab3 100644 --- a/agents/customer_explorer_agent.py +++ b/agents/customer_explorer_agent.py @@ -6,9 +6,11 @@ import dotenv from agno.agent import Agent from agno.models.openai import OpenAIChat +from agno.models.nebius import Nebius from pydantic import BaseModel from datetime import datetime, timedelta import json +import agent_config dotenv.load_dotenv() @@ -116,12 +118,9 @@ def create_customer_explorer_agent() -> Agent: """Create and return the customer explorer agent""" return Agent( name="CustomerExplorerAgent", - model=OpenAIChat( - id="gpt-4o-mini", - api_key=os.getenv("OPENAI_API_KEY"), - ), + model=agent_config.get_model(), description=CUSTOMER_EXPLORER_SYSTEM_PROMPT, - add_history_to_context=False, + add_history_to_context=True, ) def explore_customer_context( diff --git a/agents/next_message_agent.py b/agents/next_message_agent.py index 90e956d..edaa70d 100644 --- a/agents/next_message_agent.py +++ b/agents/next_message_agent.py @@ -6,7 +6,9 @@ import dotenv from agno.agent import Agent from agno.models.openai import OpenAIChat +from agno.models.nebius import Nebius from pydantic import BaseModel +import agent_config dotenv.load_dotenv() @@ -131,12 +133,9 @@ def create_next_message_agent() -> Agent: """Create and return the next message agent""" return Agent( name="NextMessageAgent", - model=OpenAIChat( - id="gpt-4o-mini", - api_key=os.getenv("OPENAI_API_KEY"), - ), + model=agent_config.get_model(), description=NEXT_MESSAGE_SYSTEM_PROMPT, - add_history_to_context=False, + add_history_to_context=True, ) def suggest_next_message( diff --git a/agents/summary_agent.py b/agents/summary_agent.py index 272871b..3e4b46a 100644 --- a/agents/summary_agent.py +++ b/agents/summary_agent.py @@ -6,7 +6,9 @@ import dotenv from agno.agent import Agent from agno.models.openai import OpenAIChat +from agno.models.nebius import Nebius from pydantic import BaseModel +import 
agent_config dotenv.load_dotenv() @@ -46,12 +48,9 @@ def create_summary_agent() -> Agent: """Create and return the summary agent""" return Agent( name="SummaryAgent", - model=OpenAIChat( - id="gpt-4o-mini", - api_key=os.getenv("OPENAI_API_KEY"), - ), + model=agent_config.get_model(), description=SUMMARY_SYSTEM_PROMPT, - add_history_to_context=False, # Each call is independent + add_history_to_context=True, ) def summarize_conversation(messages: List[Dict[str, Any]]) -> MessageSummary: diff --git a/main_support_agent.py b/main_support_agent.py index faa8a26..c21ddd4 100644 --- a/main_support_agent.py +++ b/main_support_agent.py @@ -10,6 +10,7 @@ import dotenv from agno.agent import Agent from agno.models.openai import OpenAIChat +from agno.models.nebius import Nebius # Import our specialized agents as tools from agents.summary_agent import summarize_conversation @@ -179,9 +180,13 @@ def escalate_to_human(reason: str, urgency: str = "medium") -> str: # Create the main support agent support_agent = Agent( name="BankCustomerSupportAgent", - model=OpenAIChat( - id="gpt-4o-mini", - api_key=os.getenv("OPENAI_API_KEY"), + # model=OpenAIChat( + # id="gpt-4o-mini", + # api_key=os.getenv("OPENAI_API_KEY"), + # ), + model=Nebius( + id="openai/gpt-oss-120b", + api_key=os.getenv("NEBIUS_API_KEY"), ), tools=[ get_conversation_summary, diff --git a/main_support_agent_claude.py b/main_support_agent_claude.py new file mode 100644 index 0000000..61fdb79 --- /dev/null +++ b/main_support_agent_claude.py @@ -0,0 +1,230 @@ +""" +Main Bank Customer Support Agent - Claude Opus 4.6 Model + +This is the production code - kept very simple. One agent with tools, Agno handles memory. 
+""" + +import os +import json +from typing import Dict, Any +import dotenv +from agno.agent import Agent +from agno.models.anthropic import Claude + +import agent_config + +dotenv.load_dotenv() + +agent_config.set_model(Claude(id="claude-opus-4-6", api_key=os.getenv("ANTHROPIC_API_KEY"))) + +# Import our specialized agents as tools +from agents.summary_agent import summarize_conversation +from agents.next_message_agent import suggest_next_message +from agents.customer_explorer_agent import ( + explore_customer_context, + analyze_customer_behavior, +) + +import langwatch +from openinference.instrumentation.agno import AgnoInstrumentor + +langwatch.setup(instrumentors=[AgnoInstrumentor()]) + +SYSTEM_PROMPT = """ +You are a customer support agent for SecureBank, a modern digital banking platform. + +Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations: + +TOOL USAGE REQUIREMENTS: + +1. **explore_customer_account** - ALWAYS use when: + - Customer mentions fraud, unauthorized transactions, or security concerns + - Customer asks about spending patterns, budgeting, or financial analysis + - Customer needs account-specific insights or personalized recommendations + - Any urgent business account issues that need immediate investigation + +2. **get_message_suggestion** - ALWAYS use when: + - Customer has complex, multi-part problems (locked accounts + fees + missing deposits) + - You need guidance on complex banking regulations or procedures + - Customer issue involves multiple interconnected banking services + +3. **escalate_to_human** - ALWAYS use when: + - Customer explicitly demands to speak with a manager, supervisor, or human agent + - Customer expresses extreme frustration or dissatisfaction + - Business customer has urgent issues affecting operations (payroll, employee payments) + - Set urgency to "high" for business-critical issues + +4. 
**get_conversation_summary** - Use when: + - Customer asks you to summarize the conversation + - You need to analyze conversation patterns or sentiment + +CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly. + +Guidelines: +- Be helpful, professional, and empathetic +- Use tools proactively based on the requirements above +- Provide clear, actionable solutions +- Always prioritize customer security and privacy + +Remember: Tool usage is not optional when the situation matches the requirements above. +""" + + +def get_conversation_summary(conversation_context: str = "recent messages") -> str: + """ + Analyze the conversation for patterns, sentiment, and key issues + + Args: + conversation_context: Context about what to analyze + + Returns: + JSON string with conversation analysis + """ + langwatch.get_current_trace().update( + metadata={"labels": ["tool_get_conversation_summary"]} + ) + # In a real implementation, this would get the actual conversation history + # For now, we'll simulate with a basic response + return json.dumps( + { + "summary": "Conversation analysis requested", + "sentiment": "neutral", + "key_issues": ["general inquiry"], + "suggested_actions": ["continue conversation"], + } + ) + + +def get_message_suggestion(customer_query: str, context: str = "") -> str: + """ + Get suggestions for responding to customer queries using knowledge base + + Args: + customer_query: The customer's question or concern + context: Additional context about the conversation + + Returns: + JSON string with response suggestions + """ + langwatch.get_current_trace().update( + metadata={"labels": ["tool_get_message_suggestion"]} + ) + # Simulate knowledge base lookup + suggestion_data = { + "suggested_response": f"I understand your concern about: {customer_query}. 
Let me help you with that.", + "confidence": "medium", + "knowledge_sources": ["general_banking_guide"], + "alternatives": ["Ask for more details", "Escalate to specialist"], + } + return json.dumps(suggestion_data) + + +def explore_customer_account(customer_id: str, query: str) -> str: + """ + Explore customer account data and provide rich insights + + Args: + customer_id: Customer identifier (e.g., CUST_001) + query: What to explore about the customer + + Returns: + JSON string with customer insights and rich experiences + """ + # Get customer behavior analysis + behavior = analyze_customer_behavior(customer_id) + + # Get rich experiences based on query + rich_experiences = explore_customer_context(customer_id, query) + + langwatch.get_current_trace().update( + metadata={"labels": ["tool_explore_customer_account"]} + ) + + return json.dumps( + { + "customer_behavior": behavior, + "rich_experiences": [ + { + "type": exp.component_type, + "title": exp.title, + "data": exp.data, + "actions": exp.actions, + "priority": exp.priority, + } + for exp in rich_experiences + ], + } + ) + + +def escalate_to_human(reason: str, urgency: str = "medium") -> str: + """ + Escalate the conversation to a human agent + + Args: + reason: Why the escalation is needed + urgency: Priority level (low, medium, high) + + Returns: + JSON string with escalation details + """ + langwatch.get_current_trace().update(metadata={"labels": ["escalation"]}) + + escalation_data = { + "escalated": True, + "reason": reason, + "urgency": urgency, + "estimated_wait": "5-10 minutes" if urgency == "high" else "10-15 minutes", + "message": "I'm connecting you with a specialist who can provide additional assistance.", + } + return json.dumps(escalation_data) + + +# Create the main support agent +support_agent = Agent( + name="BankCustomerSupportAgent", + model=Claude( + id="claude-opus-4-6", + api_key=os.getenv("ANTHROPIC_API_KEY"), + ), + tools=[ + get_conversation_summary, + get_message_suggestion, + 
explore_customer_account, + escalate_to_human, + ], + description=SYSTEM_PROMPT, + add_history_to_context=True, + num_history_runs=100, +) + + +# Simple interface for testing +def chat_with_agent(message: str) -> str: + """Simple interface to chat with the agent""" + response = support_agent.run(message) + return response.content + + +# Example usage +if __name__ == "__main__": + print("=== Bank Customer Support Agent ===") + print( + "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?" + ) + + # Simulate a conversation + customer_message = "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud." + print(f"\nCustomer: {customer_message}") + + response = chat_with_agent(customer_message) + print(f"Agent: {response}") + + # Continue conversation + customer_message2 = "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?" + print(f"\nCustomer: {customer_message2}") + + response2 = chat_with_agent(customer_message2) + print(f"Agent: {response2}") + + print("\n=== Conversation Complete ===") diff --git a/main_support_agent_deepseek.py b/main_support_agent_deepseek.py new file mode 100644 index 0000000..d57e791 --- /dev/null +++ b/main_support_agent_deepseek.py @@ -0,0 +1,229 @@ +""" +Main Bank Customer Support Agent - DeepSeek V3.2 Model + +This is the production code - kept very simple. One agent with tools, Agno handles memory. 
+""" + +import os +import json +from typing import Dict, Any +import dotenv +from agno.agent import Agent +from agno.models.nebius import Nebius + +import agent_config + +dotenv.load_dotenv() + +agent_config.set_model(Nebius(id="deepseek-ai/DeepSeek-V3.2", api_key=os.getenv("NEBIUS_API_KEY"))) + +# Import our specialized agents as tools +from agents.summary_agent import summarize_conversation +from agents.next_message_agent import suggest_next_message +from agents.customer_explorer_agent import ( + explore_customer_context, + analyze_customer_behavior, +) + +import langwatch +from openinference.instrumentation.agno import AgnoInstrumentor + +langwatch.setup(instrumentors=[AgnoInstrumentor()]) + +SYSTEM_PROMPT = """ +You are a customer support agent for SecureBank, a modern digital banking platform. + +Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations: + +TOOL USAGE REQUIREMENTS: + +1. **explore_customer_account** - ALWAYS use when: + - Customer mentions fraud, unauthorized transactions, or security concerns + - Customer asks about spending patterns, budgeting, or financial analysis + - Customer needs account-specific insights or personalized recommendations + - Any urgent business account issues that need immediate investigation + +2. **get_message_suggestion** - ALWAYS use when: + - Customer has complex, multi-part problems (locked accounts + fees + missing deposits) + - You need guidance on complex banking regulations or procedures + - Customer issue involves multiple interconnected banking services + +3. **escalate_to_human** - ALWAYS use when: + - Customer explicitly demands to speak with a manager, supervisor, or human agent + - Customer expresses extreme frustration or dissatisfaction + - Business customer has urgent issues affecting operations (payroll, employee payments) + - Set urgency to "high" for business-critical issues + +4. 
**get_conversation_summary** - Use when: + - Customer asks you to summarize the conversation + - You need to analyze conversation patterns or sentiment + +CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly. + +Guidelines: +- Be helpful, professional, and empathetic +- Use tools proactively based on the requirements above +- Provide clear, actionable solutions +- Always prioritize customer security and privacy + +Remember: Tool usage is not optional when the situation matches the requirements above. +""" + + +def get_conversation_summary(conversation_context: str = "recent messages") -> str: + """ + Analyze the conversation for patterns, sentiment, and key issues + + Args: + conversation_context: Context about what to analyze + + Returns: + JSON string with conversation analysis + """ + langwatch.get_current_trace().update( + metadata={"labels": ["tool_get_conversation_summary"]} + ) + # In a real implementation, this would get the actual conversation history + # For now, we'll simulate with a basic response + return json.dumps( + { + "summary": "Conversation analysis requested", + "sentiment": "neutral", + "key_issues": ["general inquiry"], + "suggested_actions": ["continue conversation"], + } + ) + + +def get_message_suggestion(customer_query: str, context: str = "") -> str: + """ + Get suggestions for responding to customer queries using knowledge base + + Args: + customer_query: The customer's question or concern + context: Additional context about the conversation + + Returns: + JSON string with response suggestions + """ + langwatch.get_current_trace().update( + metadata={"labels": ["tool_get_message_suggestion"]} + ) + # Simulate knowledge base lookup + suggestion_data = { + "suggested_response": f"I understand your concern about: {customer_query}. 
Let me help you with that.", + "confidence": "medium", + "knowledge_sources": ["general_banking_guide"], + "alternatives": ["Ask for more details", "Escalate to specialist"], + } + return json.dumps(suggestion_data) + + +def explore_customer_account(customer_id: str, query: str) -> str: + """ + Explore customer account data and provide rich insights + + Args: + customer_id: Customer identifier (e.g., CUST_001) + query: What to explore about the customer + + Returns: + JSON string with customer insights and rich experiences + """ + # Get customer behavior analysis + behavior = analyze_customer_behavior(customer_id) + + # Get rich experiences based on query + rich_experiences = explore_customer_context(customer_id, query) + + langwatch.get_current_trace().update( + metadata={"labels": ["tool_explore_customer_account"]} + ) + + return json.dumps( + { + "customer_behavior": behavior, + "rich_experiences": [ + { + "type": exp.component_type, + "title": exp.title, + "data": exp.data, + "actions": exp.actions, + "priority": exp.priority, + } + for exp in rich_experiences + ], + } + ) + + +def escalate_to_human(reason: str, urgency: str = "medium") -> str: + """ + Escalate the conversation to a human agent + + Args: + reason: Why the escalation is needed + urgency: Priority level (low, medium, high) + + Returns: + JSON string with escalation details + """ + langwatch.get_current_trace().update(metadata={"labels": ["escalation"]}) + + escalation_data = { + "escalated": True, + "reason": reason, + "urgency": urgency, + "estimated_wait": "5-10 minutes" if urgency == "high" else "10-15 minutes", + "message": "I'm connecting you with a specialist who can provide additional assistance.", + } + return json.dumps(escalation_data) + + +# Create the main support agent +support_agent = Agent( + name="BankCustomerSupportAgent", + model=Nebius( + id="deepseek-ai/DeepSeek-V3.2", + api_key=os.getenv("NEBIUS_API_KEY"), + ), + tools=[ + get_conversation_summary, + get_message_suggestion, + 
explore_customer_account, + escalate_to_human, + ], + description=SYSTEM_PROMPT, + add_history_to_context=True, # Let Agno handle memory +) + + +# Simple interface for testing +def chat_with_agent(message: str) -> str: + """Simple interface to chat with the agent""" + response = support_agent.run(message) + return response.content + + +# Example usage +if __name__ == "__main__": + print("=== Bank Customer Support Agent ===") + print( + "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?" + ) + + # Simulate a conversation + customer_message = "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud." + print(f"\nCustomer: {customer_message}") + + response = chat_with_agent(customer_message) + print(f"Agent: {response}") + + # Continue conversation + customer_message2 = "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?" + print(f"\nCustomer: {customer_message2}") + + response2 = chat_with_agent(customer_message2) + print(f"Agent: {response2}") + + print("\n=== Conversation Complete ===") diff --git a/main_support_agent_glm.py b/main_support_agent_glm.py new file mode 100644 index 0000000..2440441 --- /dev/null +++ b/main_support_agent_glm.py @@ -0,0 +1,229 @@ +""" +Main Bank Customer Support Agent - GLM-4.7-FP8 Model + +This is the production code - kept very simple. One agent with tools, Agno handles memory. 
+""" + +import os +import json +from typing import Dict, Any +import dotenv +from agno.agent import Agent +from agno.models.nebius import Nebius + +import agent_config + +dotenv.load_dotenv() + +agent_config.set_model(Nebius(id="zai-org/GLM-4.7-FP8", api_key=os.getenv("NEBIUS_API_KEY"))) + +# Import our specialized agents as tools +from agents.summary_agent import summarize_conversation +from agents.next_message_agent import suggest_next_message +from agents.customer_explorer_agent import ( + explore_customer_context, + analyze_customer_behavior, +) + +import langwatch +from openinference.instrumentation.agno import AgnoInstrumentor + +langwatch.setup(instrumentors=[AgnoInstrumentor()]) + +SYSTEM_PROMPT = """ +You are a customer support agent for SecureBank, a modern digital banking platform. + +Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations: + +TOOL USAGE REQUIREMENTS: + +1. **explore_customer_account** - ALWAYS use when: + - Customer mentions fraud, unauthorized transactions, or security concerns + - Customer asks about spending patterns, budgeting, or financial analysis + - Customer needs account-specific insights or personalized recommendations + - Any urgent business account issues that need immediate investigation + +2. **get_message_suggestion** - ALWAYS use when: + - Customer has complex, multi-part problems (locked accounts + fees + missing deposits) + - You need guidance on complex banking regulations or procedures + - Customer issue involves multiple interconnected banking services + +3. **escalate_to_human** - ALWAYS use when: + - Customer explicitly demands to speak with a manager, supervisor, or human agent + - Customer expresses extreme frustration or dissatisfaction + - Business customer has urgent issues affecting operations (payroll, employee payments) + - Set urgency to "high" for business-critical issues + +4. 
**get_conversation_summary** - Use when: + - Customer asks you to summarize the conversation + - You need to analyze conversation patterns or sentiment + +CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly. + +Guidelines: +- Be helpful, professional, and empathetic +- Use tools proactively based on the requirements above +- Provide clear, actionable solutions +- Always prioritize customer security and privacy + +Remember: Tool usage is not optional when the situation matches the requirements above. +""" + + +def get_conversation_summary(conversation_context: str = "recent messages") -> str: + """ + Analyze the conversation for patterns, sentiment, and key issues + + Args: + conversation_context: Context about what to analyze + + Returns: + JSON string with conversation analysis + """ + langwatch.get_current_trace().update( + metadata={"labels": ["tool_get_conversation_summary"]} + ) + # In a real implementation, this would get the actual conversation history + # For now, we'll simulate with a basic response + return json.dumps( + { + "summary": "Conversation analysis requested", + "sentiment": "neutral", + "key_issues": ["general inquiry"], + "suggested_actions": ["continue conversation"], + } + ) + + +def get_message_suggestion(customer_query: str, context: str = "") -> str: + """ + Get suggestions for responding to customer queries using knowledge base + + Args: + customer_query: The customer's question or concern + context: Additional context about the conversation + + Returns: + JSON string with response suggestions + """ + langwatch.get_current_trace().update( + metadata={"labels": ["tool_get_message_suggestion"]} + ) + # Simulate knowledge base lookup + suggestion_data = { + "suggested_response": f"I understand your concern about: {customer_query}. 
Let me help you with that.", + "confidence": "medium", + "knowledge_sources": ["general_banking_guide"], + "alternatives": ["Ask for more details", "Escalate to specialist"], + } + return json.dumps(suggestion_data) + + +def explore_customer_account(customer_id: str, query: str) -> str: + """ + Explore customer account data and provide rich insights + + Args: + customer_id: Customer identifier (e.g., CUST_001) + query: What to explore about the customer + + Returns: + JSON string with customer insights and rich experiences + """ + # Get customer behavior analysis + behavior = analyze_customer_behavior(customer_id) + + # Get rich experiences based on query + rich_experiences = explore_customer_context(customer_id, query) + + langwatch.get_current_trace().update( + metadata={"labels": ["tool_explore_customer_account"]} + ) + + return json.dumps( + { + "customer_behavior": behavior, + "rich_experiences": [ + { + "type": exp.component_type, + "title": exp.title, + "data": exp.data, + "actions": exp.actions, + "priority": exp.priority, + } + for exp in rich_experiences + ], + } + ) + + +def escalate_to_human(reason: str, urgency: str = "medium") -> str: + """ + Escalate the conversation to a human agent + + Args: + reason: Why the escalation is needed + urgency: Priority level (low, medium, high) + + Returns: + JSON string with escalation details + """ + langwatch.get_current_trace().update(metadata={"labels": ["escalation"]}) + + escalation_data = { + "escalated": True, + "reason": reason, + "urgency": urgency, + "estimated_wait": "5-10 minutes" if urgency == "high" else "10-15 minutes", + "message": "I'm connecting you with a specialist who can provide additional assistance.", + } + return json.dumps(escalation_data) + + +# Create the main support agent +support_agent = Agent( + name="BankCustomerSupportAgent", + model=Nebius( + id="zai-org/GLM-4.7-FP8", + api_key=os.getenv("NEBIUS_API_KEY"), + ), + tools=[ + get_conversation_summary, + get_message_suggestion, + 
explore_customer_account, + escalate_to_human, + ], + description=SYSTEM_PROMPT, + add_history_to_context=True, # Let Agno handle memory +) + + +# Simple interface for testing +def chat_with_agent(message: str) -> str: + """Simple interface to chat with the agent""" + response = support_agent.run(message) + return response.content + + +# Example usage +if __name__ == "__main__": + print("=== Bank Customer Support Agent ===") + print( + "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?" + ) + + # Simulate a conversation + customer_message = "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud." + print(f"\nCustomer: {customer_message}") + + response = chat_with_agent(customer_message) + print(f"Agent: {response}") + + # Continue conversation + customer_message2 = "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?" + print(f"\nCustomer: {customer_message2}") + + response2 = chat_with_agent(customer_message2) + print(f"Agent: {response2}") + + print("\n=== Conversation Complete ===") diff --git a/main_support_agent_minimax.py b/main_support_agent_minimax.py new file mode 100644 index 0000000..55a05fe --- /dev/null +++ b/main_support_agent_minimax.py @@ -0,0 +1,229 @@ +""" +Main Bank Customer Support Agent - MiniMax-M2.1 Model + +This is the production code - kept very simple. One agent with tools, Agno handles memory. 
+""" + +import os +import json +from typing import Dict, Any +import dotenv +from agno.agent import Agent +from agno.models.nebius import Nebius + +import agent_config + +dotenv.load_dotenv() + +agent_config.set_model(Nebius(id="MiniMaxAI/MiniMax-M2.1", api_key=os.getenv("NEBIUS_API_KEY"))) + +# Import our specialized agents as tools +from agents.summary_agent import summarize_conversation +from agents.next_message_agent import suggest_next_message +from agents.customer_explorer_agent import ( + explore_customer_context, + analyze_customer_behavior, +) + +import langwatch +from openinference.instrumentation.agno import AgnoInstrumentor + +langwatch.setup(instrumentors=[AgnoInstrumentor()]) + +SYSTEM_PROMPT = """ +You are a customer support agent for SecureBank, a modern digital banking platform. + +Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations: + +TOOL USAGE REQUIREMENTS: + +1. **explore_customer_account** - ALWAYS use when: + - Customer mentions fraud, unauthorized transactions, or security concerns + - Customer asks about spending patterns, budgeting, or financial analysis + - Customer needs account-specific insights or personalized recommendations + - Any urgent business account issues that need immediate investigation + +2. **get_message_suggestion** - ALWAYS use when: + - Customer has complex, multi-part problems (locked accounts + fees + missing deposits) + - You need guidance on complex banking regulations or procedures + - Customer issue involves multiple interconnected banking services + +3. **escalate_to_human** - ALWAYS use when: + - Customer explicitly demands to speak with a manager, supervisor, or human agent + - Customer expresses extreme frustration or dissatisfaction + - Business customer has urgent issues affecting operations (payroll, employee payments) + - Set urgency to "high" for business-critical issues + +4. 
**get_conversation_summary** - Use when: + - Customer asks you to summarize the conversation + - You need to analyze conversation patterns or sentiment + +CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly. + +Guidelines: +- Be helpful, professional, and empathetic +- Use tools proactively based on the requirements above +- Provide clear, actionable solutions +- Always prioritize customer security and privacy + +Remember: Tool usage is not optional when the situation matches the requirements above. +""" + + +def get_conversation_summary(conversation_context: str = "recent messages") -> str: + """ + Analyze the conversation for patterns, sentiment, and key issues + + Args: + conversation_context: Context about what to analyze + + Returns: + JSON string with conversation analysis + """ + langwatch.get_current_trace().update( + metadata={"labels": ["tool_get_conversation_summary"]} + ) + # In a real implementation, this would get the actual conversation history + # For now, we'll simulate with a basic response + return json.dumps( + { + "summary": "Conversation analysis requested", + "sentiment": "neutral", + "key_issues": ["general inquiry"], + "suggested_actions": ["continue conversation"], + } + ) + + +def get_message_suggestion(customer_query: str, context: str = "") -> str: + """ + Get suggestions for responding to customer queries using knowledge base + + Args: + customer_query: The customer's question or concern + context: Additional context about the conversation + + Returns: + JSON string with response suggestions + """ + langwatch.get_current_trace().update( + metadata={"labels": ["tool_get_message_suggestion"]} + ) + # Simulate knowledge base lookup + suggestion_data = { + "suggested_response": f"I understand your concern about: {customer_query}. 
Let me help you with that.", + "confidence": "medium", + "knowledge_sources": ["general_banking_guide"], + "alternatives": ["Ask for more details", "Escalate to specialist"], + } + return json.dumps(suggestion_data) + + +def explore_customer_account(customer_id: str, query: str) -> str: + """ + Explore customer account data and provide rich insights + + Args: + customer_id: Customer identifier (e.g., CUST_001) + query: What to explore about the customer + + Returns: + JSON string with customer insights and rich experiences + """ + # Get customer behavior analysis + behavior = analyze_customer_behavior(customer_id) + + # Get rich experiences based on query + rich_experiences = explore_customer_context(customer_id, query) + + langwatch.get_current_trace().update( + metadata={"labels": ["tool_explore_customer_account"]} + ) + + return json.dumps( + { + "customer_behavior": behavior, + "rich_experiences": [ + { + "type": exp.component_type, + "title": exp.title, + "data": exp.data, + "actions": exp.actions, + "priority": exp.priority, + } + for exp in rich_experiences + ], + } + ) + + +def escalate_to_human(reason: str, urgency: str = "medium") -> str: + """ + Escalate the conversation to a human agent + + Args: + reason: Why the escalation is needed + urgency: Priority level (low, medium, high) + + Returns: + JSON string with escalation details + """ + langwatch.get_current_trace().update(metadata={"labels": ["escalation"]}) + + escalation_data = { + "escalated": True, + "reason": reason, + "urgency": urgency, + "estimated_wait": "5-10 minutes" if urgency == "high" else "10-15 minutes", + "message": "I'm connecting you with a specialist who can provide additional assistance.", + } + return json.dumps(escalation_data) + + +# Create the main support agent +support_agent = Agent( + name="BankCustomerSupportAgent", + model=Nebius( + id="MiniMaxAI/MiniMax-M2.1", + api_key=os.getenv("NEBIUS_API_KEY"), + ), + tools=[ + get_conversation_summary, + get_message_suggestion, + 
explore_customer_account, + escalate_to_human, + ], + description=SYSTEM_PROMPT, + add_history_to_context=True, # Let Agno handle memory +) + + +# Simple interface for testing +def chat_with_agent(message: str) -> str: + """Simple interface to chat with the agent""" + response = support_agent.run(message) + return response.content + + +# Example usage +if __name__ == "__main__": + print("=== Bank Customer Support Agent ===") + print( + "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?" + ) + + # Simulate a conversation + customer_message = "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud." + print(f"\nCustomer: {customer_message}") + + response = chat_with_agent(customer_message) + print(f"Agent: {response}") + + # Continue conversation + customer_message2 = "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?" + print(f"\nCustomer: {customer_message2}") + + response2 = chat_with_agent(customer_message2) + print(f"Agent: {response2}") + + print("\n=== Conversation Complete ===") diff --git a/main_support_agent_openai.py b/main_support_agent_openai.py new file mode 100644 index 0000000..259167c --- /dev/null +++ b/main_support_agent_openai.py @@ -0,0 +1,229 @@ +""" +Main Bank Customer Support Agent - OpenAI GPT-OSS-120B Model + +This is the production code - kept very simple. One agent with tools, Agno handles memory. 
+""" + +import os +import json +from typing import Dict, Any +import dotenv +from agno.agent import Agent +from agno.models.nebius import Nebius + +import agent_config + +dotenv.load_dotenv() + +agent_config.set_model(Nebius(id="openai/gpt-oss-120b", api_key=os.getenv("NEBIUS_API_KEY"))) + +# Import our specialized agents as tools +from agents.summary_agent import summarize_conversation +from agents.next_message_agent import suggest_next_message +from agents.customer_explorer_agent import ( + explore_customer_context, + analyze_customer_behavior, +) + +import langwatch +from openinference.instrumentation.agno import AgnoInstrumentor + +langwatch.setup(instrumentors=[AgnoInstrumentor()]) + +SYSTEM_PROMPT = """ +You are a customer support agent for SecureBank, a modern digital banking platform. + +Your role is to help customers with their banking needs professionally and efficiently. You have access to specialized tools that MUST be used in specific situations: + +TOOL USAGE REQUIREMENTS: + +1. **explore_customer_account** - ALWAYS use when: + - Customer mentions fraud, unauthorized transactions, or security concerns + - Customer asks about spending patterns, budgeting, or financial analysis + - Customer needs account-specific insights or personalized recommendations + - Any urgent business account issues that need immediate investigation + +2. **get_message_suggestion** - ALWAYS use when: + - Customer has complex, multi-part problems (locked accounts + fees + missing deposits) + - You need guidance on complex banking regulations or procedures + - Customer issue involves multiple interconnected banking services + +3. **escalate_to_human** - ALWAYS use when: + - Customer explicitly demands to speak with a manager, supervisor, or human agent + - Customer expresses extreme frustration or dissatisfaction + - Business customer has urgent issues affecting operations (payroll, employee payments) + - Set urgency to "high" for business-critical issues + +4. 
**get_conversation_summary** - Use when: + - Customer asks you to summarize the conversation + - You need to analyze conversation patterns or sentiment + +CRITICAL: For simple questions like service hours, do NOT use unnecessary tools. Respond directly. + +Guidelines: +- Be helpful, professional, and empathetic +- Use tools proactively based on the requirements above +- Provide clear, actionable solutions +- Always prioritize customer security and privacy + +Remember: Tool usage is not optional when the situation matches the requirements above. +""" + + +def get_conversation_summary(conversation_context: str = "recent messages") -> str: + """ + Analyze the conversation for patterns, sentiment, and key issues + + Args: + conversation_context: Context about what to analyze + + Returns: + JSON string with conversation analysis + """ + langwatch.get_current_trace().update( + metadata={"labels": ["tool_get_conversation_summary"]} + ) + # In a real implementation, this would get the actual conversation history + # For now, we'll simulate with a basic response + return json.dumps( + { + "summary": "Conversation analysis requested", + "sentiment": "neutral", + "key_issues": ["general inquiry"], + "suggested_actions": ["continue conversation"], + } + ) + + +def get_message_suggestion(customer_query: str, context: str = "") -> str: + """ + Get suggestions for responding to customer queries using knowledge base + + Args: + customer_query: The customer's question or concern + context: Additional context about the conversation + + Returns: + JSON string with response suggestions + """ + langwatch.get_current_trace().update( + metadata={"labels": ["tool_get_message_suggestion"]} + ) + # Simulate knowledge base lookup + suggestion_data = { + "suggested_response": f"I understand your concern about: {customer_query}. 
Let me help you with that.", + "confidence": "medium", + "knowledge_sources": ["general_banking_guide"], + "alternatives": ["Ask for more details", "Escalate to specialist"], + } + return json.dumps(suggestion_data) + + +def explore_customer_account(customer_id: str, query: str) -> str: + """ + Explore customer account data and provide rich insights + + Args: + customer_id: Customer identifier (e.g., CUST_001) + query: What to explore about the customer + + Returns: + JSON string with customer insights and rich experiences + """ + # Get customer behavior analysis + behavior = analyze_customer_behavior(customer_id) + + # Get rich experiences based on query + rich_experiences = explore_customer_context(customer_id, query) + + langwatch.get_current_trace().update( + metadata={"labels": ["tool_explore_customer_account"]} + ) + + return json.dumps( + { + "customer_behavior": behavior, + "rich_experiences": [ + { + "type": exp.component_type, + "title": exp.title, + "data": exp.data, + "actions": exp.actions, + "priority": exp.priority, + } + for exp in rich_experiences + ], + } + ) + + +def escalate_to_human(reason: str, urgency: str = "medium") -> str: + """ + Escalate the conversation to a human agent + + Args: + reason: Why the escalation is needed + urgency: Priority level (low, medium, high) + + Returns: + JSON string with escalation details + """ + langwatch.get_current_trace().update(metadata={"labels": ["escalation"]}) + + escalation_data = { + "escalated": True, + "reason": reason, + "urgency": urgency, + "estimated_wait": "5-10 minutes" if urgency == "high" else "10-15 minutes", + "message": "I'm connecting you with a specialist who can provide additional assistance.", + } + return json.dumps(escalation_data) + + +# Create the main support agent +support_agent = Agent( + name="BankCustomerSupportAgent", + model=Nebius( + id="openai/gpt-oss-120b", + api_key=os.getenv("NEBIUS_API_KEY"), + ), + tools=[ + get_conversation_summary, + get_message_suggestion, + 
explore_customer_account, + escalate_to_human, + ], + description=SYSTEM_PROMPT, + add_history_to_context=True, # Let Agno handle memory +) + + +# Simple interface for testing +def chat_with_agent(message: str) -> str: + """Simple interface to chat with the agent""" + response = support_agent.run(message) + return response.content + + +# Example usage +if __name__ == "__main__": + print("=== Bank Customer Support Agent ===") + print( + "Agent: Hello! I'm here to help with your banking needs. How can I assist you today?" + ) + + # Simulate a conversation + customer_message = "Hi, I'm seeing some transactions on my account that I don't recognize. I'm worried about fraud." + print(f"\nCustomer: {customer_message}") + + response = chat_with_agent(customer_message) + print(f"Agent: {response}") + + # Continue conversation + customer_message2 = "Yes, there's an $85 charge from Amazon and a $45 gas station charge. Can you help me freeze my card?" + print(f"\nCustomer: {customer_message2}") + + response2 = chat_with_agent(customer_message2) + print(f"Agent: {response2}") + + print("\n=== Conversation Complete ===") diff --git a/tests-demo/__init__.py b/tests-demo/__init__.py new file mode 100644 index 0000000..26bd4a3 --- /dev/null +++ b/tests-demo/__init__.py @@ -0,0 +1,3 @@ +""" +Demo tests for bank customer support agent across different models +""" diff --git a/tests-demo/test_demo_claude.py b/tests-demo/test_demo_claude.py new file mode 100644 index 0000000..d21b33c --- /dev/null +++ b/tests-demo/test_demo_claude.py @@ -0,0 +1,427 @@ +""" +Tests for the main bank customer support agent - Claude Sonnet 4.5 Model + +These tests cover real business scenarios and validate tool calling behavior +using Claude Sonnet 4.5 model for evaluation. 
+""" +import asyncio +import pytest +import json +import sys +import os +import dotenv + +# Add parent directory to path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import scenario +from main_support_agent_claude import support_agent + +dotenv.load_dotenv() +scenario.configure(default_model="openai/gpt-4o") + + +def _parse_tool_arguments(tool_call: dict) -> dict: + raw_args = tool_call["function"].get("arguments", {}) + if isinstance(raw_args, dict): + return raw_args + if isinstance(raw_args, str): + try: + return json.loads(raw_args) + except json.JSONDecodeError: + return {} + return {} + + +def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None: + assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}" + + +def _build_tool_trace_messages(response) -> list[dict]: + messages: list[dict] = [] + for i, tool in enumerate(response.tools or []): + tool_call_id = tool.tool_call_id or f"tool_call_{i}" + tool_name = tool.tool_name or "unknown_tool" + tool_args = tool.tool_args if isinstance(tool.tool_args, dict) else {} + tool_result = tool.result if isinstance(tool.result, str) else json.dumps(tool.result or {}) + + messages.append( + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": tool_call_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": json.dumps(tool_args), + }, + } + ], + } + ) + messages.append( + { + "role": "tool", + "tool_call_id": tool_call_id, + "content": tool_result, + } + ) + + if isinstance(response.content, str) and response.content: + messages.append({"role": "assistant", "content": response.content}) + + return messages + + +class BankSupportAgentAdapter(scenario.AgentAdapter): + """Adapter for our main bank support agent""" + + async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes: + message_content = input.last_new_user_message_str() + response = 
support_agent.run(message_content) + + # Use synthetic tool trace messages — these properly pair tool calls + # with their results, avoiding missing tool_call_id issues with + # Claude's toolu_* IDs + synthetic_messages = _build_tool_trace_messages(response) + if synthetic_messages: + return synthetic_messages + + # Fallback to content if no tool calls + return response.content # type: ignore + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_fraud_investigation_workflow(): + result = await scenario.run( + name="fraud investigation and card security - Claude", + description=""" + Customer discovers unauthorized transactions on their account and is worried about fraud. + They need immediate help to secure their account and investigate the suspicious activity. + This tests whether the agent responds with appropriate urgency and offers concrete security actions. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent takes fraud concerns seriously and responds with urgency", + "Agent gathers necessary information (account details) to investigate", + "Agent offers concrete security actions like card freezing or blocking", + "Agent provides clear next steps for fraud investigation and dispute process", + "Agent maintains professional and reassuring tone throughout", + "Agent does not re-ask for customer ID that was already provided", + ] + ), + ], + script=[ + scenario.user( + "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!" + ), + scenario.agent(), + scenario.user( + "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases." + ), + scenario.agent(), + scenario.user( + "Yes, please help me secure my account right away. I'm really worried about more charges appearing." 
+ ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Fraud investigation test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_escalation_workflow(): + def check_escalation_called(state: scenario.ScenarioState): + """Verify agent escalates when customer explicitly demands human help""" + assert state.has_tool_call( + "escalate_to_human" + ), "Agent should escalate when customer demands manager/human help" + + tool_call = state.last_tool_call("escalate_to_human") + if tool_call: + args = _parse_tool_arguments(tool_call) + reason = args.get("reason", "").lower() + assert any( + keyword in reason + for keyword in ["frustrated", "manager", "human", "escalation"] + ), "Escalation reason should reflect customer's frustration and demand" + + result = await scenario.run( + name="customer escalation to human agent - Claude", + description=""" + Customer has been dealing with an ongoing issue and is frustrated. + They explicitly demand to speak with a human agent or manager. + The agent should handle this professionally and escalate appropriately. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent acknowledges customer's frustration empathetically", + "Agent offers to escalate when requested", + "Agent provides escalation timeline and process information", + "Agent maintains professionalism despite customer frustration", + ] + ), + ], + script=[ + scenario.user( + "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!" + ), + scenario.agent(), + scenario.user( + "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service." 
+ ), + scenario.agent(), + check_escalation_called, + scenario.judge(), + ], + ) + + _assert_success(result, "Escalation test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_complex_issue_triggers_knowledge_base(): + result = await scenario.run( + name="complex multi-issue banking problem - Claude", + description=""" + Customer has multiple interconnected banking problems: locked online banking, + unexpected fees, and missing direct deposit. They need systematic help. + This tests whether the agent can handle multiple issues comprehensively. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)", + "Agent provides systematic approach with clear steps for each issue", + "Agent shows empathy for customer's stress and urgency", + "Agent prioritizes the most urgent issue (locked account for bill payments)", + "Agent offers concrete next steps that the customer can act on", + ] + ), + ], + script=[ + scenario.user( + "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001." + ), + scenario.agent(), + scenario.user( + "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out." + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Complex issue test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_urgent_business_scenario(): + result = await scenario.run( + name="urgent business account problem - Claude", + description=""" + Business customer has an urgent issue affecting their operations. + They can't access funds to pay employees. This tests whether the agent + recognizes urgency and takes appropriate high-priority action. 
+ """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent immediately recognizes the business urgency and employee impact", + "Agent responds with high priority and urgency in tone", + "Agent takes concrete action (investigating the freeze or escalating to specialists)", + "Agent provides realistic timeline or sets expectations appropriately", + "Agent offers interim solutions or workarounds if available", + ] + ), + ], + script=[ + scenario.user( + "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001." + ), + scenario.agent(), + scenario.user( + "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Urgent business test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_simple_inquiry_no_tools(): + result = await scenario.run( + name="simple inquiry without tool usage - Claude", + description=""" + Customer asks a simple question about branch hours or general banking info. + The agent should answer directly without invoking any tools. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent answers the simple question directly and helpfully", + "Agent does not over-complicate the response", + "Agent maintains a friendly and professional tone", + ] + ), + ], + script=[ + scenario.user( + "What are your customer support hours? I just want to know when I can call if I have an issue." 
+ ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Simple inquiry test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_spending_analysis_request(): + result = await scenario.run( + name="spending analysis and budgeting help - Claude", + description=""" + Customer wants to understand their spending patterns and get budgeting advice. + The agent should use explore_customer_account to analyze their transactions. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent uses account exploration tools to analyze spending", + "Agent provides specific insights about spending categories", + "Agent offers actionable budgeting advice or recommendations", + "Agent is helpful and non-judgmental about spending habits", + ] + ), + ], + script=[ + scenario.user( + "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?" + ), + scenario.agent(), + scenario.user( + "That's really helpful. Are there any areas where you think I could cut back?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Spending analysis test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_lost_card_replacement(): + result = await scenario.run( + name="lost card replacement workflow - Claude", + description=""" + Customer has lost their debit card and needs a replacement. + Tests whether the agent handles the card replacement process properly + including immediate security measures. 
+ """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent treats lost card with appropriate urgency", + "Agent suggests freezing or blocking the lost card immediately", + "Agent explains the replacement card process and timeline", + "Agent asks about any unauthorized transactions since the card was lost", + "Agent reassures the customer about account security", + ] + ), + ], + script=[ + scenario.user( + "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001." + ), + scenario.agent(), + scenario.user( + "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Lost card replacement test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_overdraft_fee_dispute(): + result = await scenario.run( + name="overdraft fee dispute and resolution - Claude", + description=""" + Customer with a basic checking account notices an overdraft fee and wants + it reversed. Tests empathy, account investigation, and fee resolution. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent shows empathy for the customer's frustration about the fee", + "Agent investigates the account to understand the overdraft situation", + "Agent explains how the overdraft fee occurred", + "Agent offers a resolution path (fee waiver, escalation, or explanation)", + "Agent suggests ways to avoid future overdraft fees", + ] + ), + ], + script=[ + scenario.user( + "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002." + ), + scenario.agent(), + scenario.user( + "This isn't fair. 
I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Overdraft fee dispute test") + + +if __name__ == "__main__": + asyncio.run(test_fraud_investigation_workflow()) diff --git a/tests-demo/test_demo_deepseek.py b/tests-demo/test_demo_deepseek.py new file mode 100644 index 0000000..6133a40 --- /dev/null +++ b/tests-demo/test_demo_deepseek.py @@ -0,0 +1,451 @@ +""" +Tests for the main bank customer support agent - DeepSeek Model + +These tests cover real business scenarios and validate tool calling behavior +using Nebius DeepSeek-V3.2 model for evaluation. +""" +import asyncio +import pytest +import json +import sys +import os +import dotenv + +# Add parent directory to path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import scenario +from main_support_agent_deepseek import support_agent + +dotenv.load_dotenv() +scenario.configure(default_model="openai/gpt-4o") + + +def _parse_tool_arguments(tool_call: dict) -> dict: + raw_args = tool_call["function"].get("arguments", {}) + if isinstance(raw_args, dict): + return raw_args + if isinstance(raw_args, str): + try: + return json.loads(raw_args) + except json.JSONDecodeError: + return {} + return {} + + +def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None: + assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}" + + +def _build_tool_trace_messages(response) -> list[dict]: + messages: list[dict] = [] + for i, tool in enumerate(response.tools or []): + tool_call_id = tool.tool_call_id or f"tool_call_{i}" + tool_name = tool.tool_name or "unknown_tool" + tool_args = tool.tool_args if isinstance(tool.tool_args, dict) else {} + tool_result = tool.result if isinstance(tool.result, str) else json.dumps(tool.result or {}) + + messages.append( + { + "role": "assistant", + "content": "", + 
"tool_calls": [ + { + "id": tool_call_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": json.dumps(tool_args), + }, + } + ], + } + ) + messages.append( + { + "role": "tool", + "tool_call_id": tool_call_id, + "content": tool_result, + } + ) + + if isinstance(response.content, str) and response.content: + messages.append({"role": "assistant", "content": response.content}) + + return messages + + +class BankSupportAgentAdapter(scenario.AgentAdapter): + """Adapter for our main bank support agent""" + + async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes: + message_content = input.last_new_user_message_str() + response = support_agent.run(message_content) + + # Convert Agno messages to OpenAI format for Scenario + openai_messages = [] + for message in response.messages or []: + if message.role in ["assistant", "user", "system", "tool"]: + msg_dict = {"role": message.role, "content": message.content} + + # Add tool calls if present (for assistant messages) + if message.tool_calls: + msg_dict["tool_calls"] = message.tool_calls + + # Add tool call ID if present (for tool messages) + if hasattr(message, "tool_call_id") and message.tool_call_id: + msg_dict["tool_call_id"] = message.tool_call_id + + openai_messages.append(msg_dict) + + # Return all messages except system and user (Scenario manages the conversation flow) + # We need to include tool messages to satisfy OpenAI's requirements + relevant_messages = [ + msg for msg in openai_messages if msg["role"] in ["assistant", "tool"] + ] + + if relevant_messages: + has_tool_calls = any( + msg["role"] == "assistant" and "tool_calls" in msg for msg in relevant_messages + ) + if has_tool_calls: + return relevant_messages + + synthetic_messages = _build_tool_trace_messages(response) + if synthetic_messages: + return synthetic_messages + + # Fallback to content if no relevant messages found + return response.content # type: ignore + +@pytest.mark.agent_test 
+@pytest.mark.asyncio +async def test_fraud_investigation_workflow(): + result = await scenario.run( + name="fraud investigation and card security - DeepSeek", + description=""" + Customer discovers unauthorized transactions on their account and is worried about fraud. + They need immediate help to secure their account and investigate the suspicious activity. + This tests whether the agent responds with appropriate urgency and offers concrete security actions. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent takes fraud concerns seriously and responds with urgency", + "Agent gathers necessary information (account details) to investigate", + "Agent offers concrete security actions like card freezing or blocking", + "Agent provides clear next steps for fraud investigation and dispute process", + "Agent maintains professional and reassuring tone throughout", + "Agent does not re-ask for customer ID that was already provided", + ] + ), + ], + script=[ + scenario.user( + "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!" + ), + scenario.agent(), + scenario.user( + "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases." + ), + scenario.agent(), + scenario.user( + "Yes, please help me secure my account right away. I'm really worried about more charges appearing." 
+ ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Fraud investigation test") + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_escalation_workflow(): + def check_escalation_called(state: scenario.ScenarioState): + """Verify agent escalates when customer explicitly demands human help""" + assert state.has_tool_call( + "escalate_to_human" + ), "Agent should escalate when customer demands manager/human help" + + tool_call = state.last_tool_call("escalate_to_human") + if tool_call: + args = _parse_tool_arguments(tool_call) + reason = args.get("reason", "").lower() + assert any( + keyword in reason + for keyword in ["frustrated", "manager", "human", "escalation"] + ), "Escalation reason should reflect customer's frustration and demand" + + result = await scenario.run( + name="customer escalation to human agent - DeepSeek", + description=""" + Customer has been dealing with an ongoing issue and is frustrated. + They explicitly demand to speak with a human agent or manager. + The agent should handle this professionally and escalate appropriately. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent acknowledges customer's frustration empathetically", + "Agent offers to escalate when requested", + "Agent provides escalation timeline and process information", + "Agent maintains professionalism despite customer frustration", + ] + ), + ], + script=[ + scenario.user( + "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!" + ), + scenario.agent(), + scenario.user( + "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service." 
+ ), + scenario.agent(), + check_escalation_called, + scenario.judge(), + ], + ) + + _assert_success(result, "Escalation test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_complex_issue_triggers_knowledge_base(): + result = await scenario.run( + name="complex multi-issue banking problem - DeepSeek", + description=""" + Customer has multiple interconnected banking problems: locked online banking, + unexpected fees, and missing direct deposit. They need systematic help. + This tests whether the agent can handle multiple issues comprehensively. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)", + "Agent provides systematic approach with clear steps for each issue", + "Agent shows empathy for customer's stress and urgency", + "Agent prioritizes the most urgent issue (locked account for bill payments)", + "Agent offers concrete next steps that the customer can act on", + ] + ), + ], + script=[ + scenario.user( + "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001." + ), + scenario.agent(), + scenario.user( + "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out." + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Complex issue test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_urgent_business_scenario(): + result = await scenario.run( + name="urgent business account problem - DeepSeek", + description=""" + Business customer has an urgent issue affecting their operations. + They can't access funds to pay employees. This tests whether the agent + recognizes urgency and takes appropriate high-priority action. 
+ """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent immediately recognizes the business urgency and employee impact", + "Agent responds with high priority and urgency in tone", + "Agent takes concrete action (investigating the freeze or escalating to specialists)", + "Agent provides realistic timeline or sets expectations appropriately", + "Agent offers interim solutions or workarounds if available", + ] + ), + ], + script=[ + scenario.user( + "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001." + ), + scenario.agent(), + scenario.user( + "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Urgent business test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_simple_inquiry_no_tools(): + result = await scenario.run( + name="simple inquiry without tool usage - DeepSeek", + description=""" + Customer asks a simple question about branch hours or general banking info. + The agent should answer directly without invoking any tools. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent answers the simple question directly and helpfully", + "Agent does not over-complicate the response", + "Agent maintains a friendly and professional tone", + ] + ), + ], + script=[ + scenario.user( + "What are your customer support hours? I just want to know when I can call if I have an issue." 
+ ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Simple inquiry test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_spending_analysis_request(): + result = await scenario.run( + name="spending analysis and budgeting help - DeepSeek", + description=""" + Customer wants to understand their spending patterns and get budgeting advice. + The agent should use explore_customer_account to analyze their transactions. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent uses account exploration tools to analyze spending", + "Agent provides specific insights about spending categories", + "Agent offers actionable budgeting advice or recommendations", + "Agent is helpful and non-judgmental about spending habits", + ] + ), + ], + script=[ + scenario.user( + "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?" + ), + scenario.agent(), + scenario.user( + "That's really helpful. Are there any areas where you think I could cut back?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Spending analysis test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_lost_card_replacement(): + result = await scenario.run( + name="lost card replacement workflow - DeepSeek", + description=""" + Customer has lost their debit card and needs a replacement. + Tests whether the agent handles the card replacement process properly + including immediate security measures. 
+ """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent treats lost card with appropriate urgency", + "Agent suggests freezing or blocking the lost card immediately", + "Agent explains the replacement card process and timeline", + "Agent asks about any unauthorized transactions since the card was lost", + "Agent reassures the customer about account security", + ] + ), + ], + script=[ + scenario.user( + "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001." + ), + scenario.agent(), + scenario.user( + "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Lost card replacement test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_overdraft_fee_dispute(): + result = await scenario.run( + name="overdraft fee dispute and resolution - DeepSeek", + description=""" + Customer with a basic checking account notices an overdraft fee and wants + it reversed. Tests empathy, account investigation, and fee resolution. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent shows empathy for the customer's frustration about the fee", + "Agent investigates the account to understand the overdraft situation", + "Agent explains how the overdraft fee occurred", + "Agent offers a resolution path (fee waiver, escalation, or explanation)", + "Agent suggests ways to avoid future overdraft fees", + ] + ), + ], + script=[ + scenario.user( + "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002." + ), + scenario.agent(), + scenario.user( + "This isn't fair. 
I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Overdraft fee dispute test") + + +if __name__ == "__main__": + asyncio.run(test_fraud_investigation_workflow()) diff --git a/tests-demo/test_demo_glm.py b/tests-demo/test_demo_glm.py new file mode 100644 index 0000000..056e782 --- /dev/null +++ b/tests-demo/test_demo_glm.py @@ -0,0 +1,453 @@ +""" +Tests for the main bank customer support agent - GLM Model + +These tests cover real business scenarios and validate tool calling behavior +using Nebius GLM-4.7-FP8 model for evaluation. +""" +import asyncio +import pytest +import json +import sys +import os +import dotenv + +# Add parent directory to path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import scenario +from main_support_agent_glm import support_agent + +dotenv.load_dotenv() +scenario.configure(default_model="openai/gpt-4o") + + +def _parse_tool_arguments(tool_call: dict) -> dict: + raw_args = tool_call["function"].get("arguments", {}) + if isinstance(raw_args, dict): + return raw_args + if isinstance(raw_args, str): + try: + return json.loads(raw_args) + except json.JSONDecodeError: + return {} + return {} + + +def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None: + assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}" + + +def _build_tool_trace_messages(response) -> list[dict]: + messages: list[dict] = [] + for i, tool in enumerate(response.tools or []): + tool_call_id = tool.tool_call_id or f"tool_call_{i}" + tool_name = tool.tool_name or "unknown_tool" + tool_args = tool.tool_args if isinstance(tool.tool_args, dict) else {} + tool_result = tool.result if isinstance(tool.result, str) else json.dumps(tool.result or {}) + + messages.append( + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": 
tool_call_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": json.dumps(tool_args), + }, + } + ], + } + ) + messages.append( + { + "role": "tool", + "tool_call_id": tool_call_id, + "content": tool_result, + } + ) + + if isinstance(response.content, str) and response.content: + messages.append({"role": "assistant", "content": response.content}) + + return messages + + +class BankSupportAgentAdapter(scenario.AgentAdapter): + """Adapter for our main bank support agent""" + + async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes: + message_content = input.last_new_user_message_str() + response = support_agent.run(message_content) + + # Convert Agno messages to OpenAI format for Scenario + openai_messages = [] + for message in response.messages or []: + if message.role in ["assistant", "user", "system", "tool"]: + msg_dict = {"role": message.role, "content": message.content} + + # Add tool calls if present (for assistant messages) + if message.tool_calls: + msg_dict["tool_calls"] = message.tool_calls + + # Add tool call ID if present (for tool messages) + if hasattr(message, "tool_call_id") and message.tool_call_id: + msg_dict["tool_call_id"] = message.tool_call_id + + openai_messages.append(msg_dict) + + # Return all messages except system and user (Scenario manages the conversation flow) + # We need to include tool messages to satisfy OpenAI's requirements + relevant_messages = [ + msg for msg in openai_messages if msg["role"] in ["assistant", "tool"] + ] + + if relevant_messages: + has_tool_calls = any( + msg["role"] == "assistant" and "tool_calls" in msg for msg in relevant_messages + ) + if has_tool_calls: + return relevant_messages + + synthetic_messages = _build_tool_trace_messages(response) + if synthetic_messages: + return synthetic_messages + + # Fallback to content if no relevant messages found + return response.content # type: ignore + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def 
test_fraud_investigation_workflow(): + result = await scenario.run( + name="fraud investigation and card security - GLM", + description=""" + Customer discovers unauthorized transactions on their account and is worried about fraud. + They need immediate help to secure their account and investigate the suspicious activity. + This tests whether the agent responds with appropriate urgency and offers concrete security actions. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent takes fraud concerns seriously and responds with urgency", + "Agent gathers necessary information (account details) to investigate", + "Agent offers concrete security actions like card freezing or blocking", + "Agent provides clear next steps for fraud investigation and dispute process", + "Agent maintains professional and reassuring tone throughout", + "Agent does not re-ask for customer ID that was already provided", + ] + ), + ], + script=[ + scenario.user( + "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!" + ), + scenario.agent(), + scenario.user( + "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases." + ), + scenario.agent(), + scenario.user( + "Yes, please help me secure my account right away. I'm really worried about more charges appearing." 
+ ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Fraud investigation test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_escalation_workflow(): + def check_escalation_called(state: scenario.ScenarioState): + """Verify agent escalates when customer explicitly demands human help""" + assert state.has_tool_call( + "escalate_to_human" + ), "Agent should escalate when customer demands manager/human help" + + tool_call = state.last_tool_call("escalate_to_human") + if tool_call: + args = _parse_tool_arguments(tool_call) + reason = args.get("reason", "").lower() + assert any( + keyword in reason + for keyword in ["frustrated", "manager", "human", "escalation"] + ), "Escalation reason should reflect customer's frustration and demand" + + result = await scenario.run( + name="customer escalation to human agent - GLM", + description=""" + Customer has been dealing with an ongoing issue and is frustrated. + They explicitly demand to speak with a human agent or manager. + The agent should handle this professionally and escalate appropriately. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent acknowledges customer's frustration empathetically", + "Agent offers to escalate when requested", + "Agent provides escalation timeline and process information", + "Agent maintains professionalism despite customer frustration", + ] + ), + ], + script=[ + scenario.user( + "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!" + ), + scenario.agent(), + scenario.user( + "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service." 
+ ), + scenario.agent(), + check_escalation_called, + scenario.judge(), + ], + ) + + _assert_success(result, "Escalation test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_complex_issue_triggers_knowledge_base(): + result = await scenario.run( + name="complex multi-issue banking problem - GLM", + description=""" + Customer has multiple interconnected banking problems: locked online banking, + unexpected fees, and missing direct deposit. They need systematic help. + This tests whether the agent can handle multiple issues comprehensively. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)", + "Agent provides systematic approach with clear steps for each issue", + "Agent shows empathy for customer's stress and urgency", + "Agent prioritizes the most urgent issue (locked account for bill payments)", + "Agent offers concrete next steps that the customer can act on", + ] + ), + ], + script=[ + scenario.user( + "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001." + ), + scenario.agent(), + scenario.user( + "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out." + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Complex issue test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_urgent_business_scenario(): + result = await scenario.run( + name="urgent business account problem - GLM", + description=""" + Business customer has an urgent issue affecting their operations. + They can't access funds to pay employees. This tests whether the agent + recognizes urgency and takes appropriate high-priority action. 
+ """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent immediately recognizes the business urgency and employee impact", + "Agent responds with high priority and urgency in tone", + "Agent takes concrete action (investigating the freeze or escalating to specialists)", + "Agent provides realistic timeline or sets expectations appropriately", + "Agent offers interim solutions or workarounds if available", + ] + ), + ], + script=[ + scenario.user( + "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001." + ), + scenario.agent(), + scenario.user( + "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Urgent business test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_simple_inquiry_no_tools(): + result = await scenario.run( + name="simple inquiry without tool usage - GLM", + description=""" + Customer asks a simple question about branch hours or general banking info. + The agent should answer directly without invoking any tools. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent answers the simple question directly and helpfully", + "Agent does not over-complicate the response", + "Agent maintains a friendly and professional tone", + ] + ), + ], + script=[ + scenario.user( + "What are your customer support hours? I just want to know when I can call if I have an issue." 
+ ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Simple inquiry test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_spending_analysis_request(): + result = await scenario.run( + name="spending analysis and budgeting help - GLM", + description=""" + Customer wants to understand their spending patterns and get budgeting advice. + The agent should use explore_customer_account to analyze their transactions. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent uses account exploration tools to analyze spending", + "Agent provides specific insights about spending categories", + "Agent offers actionable budgeting advice or recommendations", + "Agent is helpful and non-judgmental about spending habits", + ] + ), + ], + script=[ + scenario.user( + "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?" + ), + scenario.agent(), + scenario.user( + "That's really helpful. Are there any areas where you think I could cut back?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Spending analysis test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_lost_card_replacement(): + result = await scenario.run( + name="lost card replacement workflow - GLM", + description=""" + Customer has lost their debit card and needs a replacement. + Tests whether the agent handles the card replacement process properly + including immediate security measures. 
+ """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent treats lost card with appropriate urgency", + "Agent suggests freezing or blocking the lost card immediately", + "Agent explains the replacement card process and timeline", + "Agent asks about any unauthorized transactions since the card was lost", + "Agent reassures the customer about account security", + ] + ), + ], + script=[ + scenario.user( + "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001." + ), + scenario.agent(), + scenario.user( + "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Lost card replacement test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_overdraft_fee_dispute(): + result = await scenario.run( + name="overdraft fee dispute and resolution - GLM", + description=""" + Customer with a basic checking account notices an overdraft fee and wants + it reversed. Tests empathy, account investigation, and fee resolution. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent shows empathy for the customer's frustration about the fee", + "Agent investigates the account to understand the overdraft situation", + "Agent explains how the overdraft fee occurred", + "Agent offers a resolution path (fee waiver, escalation, or explanation)", + "Agent suggests ways to avoid future overdraft fees", + ] + ), + ], + script=[ + scenario.user( + "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002." + ), + scenario.agent(), + scenario.user( + "This isn't fair. 
I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Overdraft fee dispute test") + + +if __name__ == "__main__": + asyncio.run(test_fraud_investigation_workflow()) diff --git a/tests-demo/test_demo_minimax.py b/tests-demo/test_demo_minimax.py new file mode 100644 index 0000000..f2d481e --- /dev/null +++ b/tests-demo/test_demo_minimax.py @@ -0,0 +1,453 @@ +""" +Tests for the main bank customer support agent - MiniMax Model + +These tests cover real business scenarios and validate tool calling behavior +using Nebius MiniMax-M2.1 model for evaluation. +""" +import asyncio +import pytest +import json +import sys +import os +import dotenv + +# Add parent directory to path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import scenario +from main_support_agent_minimax import support_agent + +dotenv.load_dotenv() +scenario.configure(default_model="openai/gpt-4o") + + +def _parse_tool_arguments(tool_call: dict) -> dict: + raw_args = tool_call["function"].get("arguments", {}) + if isinstance(raw_args, dict): + return raw_args + if isinstance(raw_args, str): + try: + return json.loads(raw_args) + except json.JSONDecodeError: + return {} + return {} + + +def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None: + assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}" + + +def _build_tool_trace_messages(response) -> list[dict]: + messages: list[dict] = [] + for i, tool in enumerate(response.tools or []): + tool_call_id = tool.tool_call_id or f"tool_call_{i}" + tool_name = tool.tool_name or "unknown_tool" + tool_args = tool.tool_args if isinstance(tool.tool_args, dict) else {} + tool_result = tool.result if isinstance(tool.result, str) else json.dumps(tool.result or {}) + + messages.append( + { + "role": "assistant", + "content": "", + "tool_calls": 
[ + { + "id": tool_call_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": json.dumps(tool_args), + }, + } + ], + } + ) + messages.append( + { + "role": "tool", + "tool_call_id": tool_call_id, + "content": tool_result, + } + ) + + if isinstance(response.content, str) and response.content: + messages.append({"role": "assistant", "content": response.content}) + + return messages + + +class BankSupportAgentAdapter(scenario.AgentAdapter): + """Adapter for our main bank support agent""" + + async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes: + message_content = input.last_new_user_message_str() + response = support_agent.run(message_content) + + # Convert Agno messages to OpenAI format for Scenario + openai_messages = [] + for message in response.messages or []: + if message.role in ["assistant", "user", "system", "tool"]: + msg_dict = {"role": message.role, "content": message.content} + + # Add tool calls if present (for assistant messages) + if message.tool_calls: + msg_dict["tool_calls"] = message.tool_calls + + # Add tool call ID if present (for tool messages) + if hasattr(message, "tool_call_id") and message.tool_call_id: + msg_dict["tool_call_id"] = message.tool_call_id + + openai_messages.append(msg_dict) + + # Return all messages except system and user (Scenario manages the conversation flow) + # We need to include tool messages to satisfy OpenAI's requirements + relevant_messages = [ + msg for msg in openai_messages if msg["role"] in ["assistant", "tool"] + ] + + if relevant_messages: + has_tool_calls = any( + msg["role"] == "assistant" and "tool_calls" in msg for msg in relevant_messages + ) + if has_tool_calls: + return relevant_messages + + synthetic_messages = _build_tool_trace_messages(response) + if synthetic_messages: + return synthetic_messages + + # Fallback to content if no relevant messages found + return response.content # type: ignore + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def 
test_fraud_investigation_workflow(): + result = await scenario.run( + name="fraud investigation and card security - MiniMax", + description=""" + Customer discovers unauthorized transactions on their account and is worried about fraud. + They need immediate help to secure their account and investigate the suspicious activity. + This tests whether the agent responds with appropriate urgency and offers concrete security actions. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent takes fraud concerns seriously and responds with urgency", + "Agent gathers necessary information (account details) to investigate", + "Agent offers concrete security actions like card freezing or blocking", + "Agent provides clear next steps for fraud investigation and dispute process", + "Agent maintains professional and reassuring tone throughout", + "Agent does not re-ask for customer ID that was already provided", + ] + ), + ], + script=[ + scenario.user( + "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!" + ), + scenario.agent(), + scenario.user( + "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases." + ), + scenario.agent(), + scenario.user( + "Yes, please help me secure my account right away. I'm really worried about more charges appearing." 
+ ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Fraud investigation test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_escalation_workflow(): + def check_escalation_called(state: scenario.ScenarioState): + """Verify agent escalates when customer explicitly demands human help""" + assert state.has_tool_call( + "escalate_to_human" + ), "Agent should escalate when customer demands manager/human help" + + tool_call = state.last_tool_call("escalate_to_human") + if tool_call: + args = _parse_tool_arguments(tool_call) + reason = args.get("reason", "").lower() + assert any( + keyword in reason + for keyword in ["frustrated", "manager", "human", "escalation"] + ), "Escalation reason should reflect customer's frustration and demand" + + result = await scenario.run( + name="customer escalation to human agent - MiniMax", + description=""" + Customer has been dealing with an ongoing issue and is frustrated. + They explicitly demand to speak with a human agent or manager. + The agent should handle this professionally and escalate appropriately. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent acknowledges customer's frustration empathetically", + "Agent offers to escalate when requested", + "Agent provides escalation timeline and process information", + "Agent maintains professionalism despite customer frustration", + ] + ), + ], + script=[ + scenario.user( + "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!" + ), + scenario.agent(), + scenario.user( + "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service." 
+ ), + scenario.agent(), + check_escalation_called, + scenario.judge(), + ], + ) + + _assert_success(result, "Escalation test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_complex_issue_triggers_knowledge_base(): + result = await scenario.run( + name="complex multi-issue banking problem - MiniMax", + description=""" + Customer has multiple interconnected banking problems: locked online banking, + unexpected fees, and missing direct deposit. They need systematic help. + This tests whether the agent can handle multiple issues comprehensively. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)", + "Agent provides systematic approach with clear steps for each issue", + "Agent shows empathy for customer's stress and urgency", + "Agent prioritizes the most urgent issue (locked account for bill payments)", + "Agent offers concrete next steps that the customer can act on", + ] + ), + ], + script=[ + scenario.user( + "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001." + ), + scenario.agent(), + scenario.user( + "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out." + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Complex issue test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_urgent_business_scenario(): + result = await scenario.run( + name="urgent business account problem - MiniMax", + description=""" + Business customer has an urgent issue affecting their operations. + They can't access funds to pay employees. This tests whether the agent + recognizes urgency and takes appropriate high-priority action. 
+ """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent immediately recognizes the business urgency and employee impact", + "Agent responds with high priority and urgency in tone", + "Agent takes concrete action (investigating the freeze or escalating to specialists)", + "Agent provides realistic timeline or sets expectations appropriately", + "Agent offers interim solutions or workarounds if available", + ] + ), + ], + script=[ + scenario.user( + "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001." + ), + scenario.agent(), + scenario.user( + "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Urgent business test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_simple_inquiry_no_tools(): + result = await scenario.run( + name="simple inquiry without tool usage - MiniMax", + description=""" + Customer asks a simple question about branch hours or general banking info. + The agent should answer directly without invoking any tools. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent answers the simple question directly and helpfully", + "Agent does not over-complicate the response", + "Agent maintains a friendly and professional tone", + ] + ), + ], + script=[ + scenario.user( + "What are your customer support hours? I just want to know when I can call if I have an issue." 
+ ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Simple inquiry test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_spending_analysis_request(): + result = await scenario.run( + name="spending analysis and budgeting help - MiniMax", + description=""" + Customer wants to understand their spending patterns and get budgeting advice. + The agent should use explore_customer_account to analyze their transactions. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent uses account exploration tools to analyze spending", + "Agent provides specific insights about spending categories", + "Agent offers actionable budgeting advice or recommendations", + "Agent is helpful and non-judgmental about spending habits", + ] + ), + ], + script=[ + scenario.user( + "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?" + ), + scenario.agent(), + scenario.user( + "That's really helpful. Are there any areas where you think I could cut back?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Spending analysis test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_lost_card_replacement(): + result = await scenario.run( + name="lost card replacement workflow - MiniMax", + description=""" + Customer has lost their debit card and needs a replacement. + Tests whether the agent handles the card replacement process properly + including immediate security measures. 
+ """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent treats lost card with appropriate urgency", + "Agent suggests freezing or blocking the lost card immediately", + "Agent explains the replacement card process and timeline", + "Agent asks about any unauthorized transactions since the card was lost", + "Agent reassures the customer about account security", + ] + ), + ], + script=[ + scenario.user( + "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001." + ), + scenario.agent(), + scenario.user( + "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Lost card replacement test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_overdraft_fee_dispute(): + result = await scenario.run( + name="overdraft fee dispute and resolution - MiniMax", + description=""" + Customer with a basic checking account notices an overdraft fee and wants + it reversed. Tests empathy, account investigation, and fee resolution. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent shows empathy for the customer's frustration about the fee", + "Agent investigates the account to understand the overdraft situation", + "Agent explains how the overdraft fee occurred", + "Agent offers a resolution path (fee waiver, escalation, or explanation)", + "Agent suggests ways to avoid future overdraft fees", + ] + ), + ], + script=[ + scenario.user( + "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002." + ), + scenario.agent(), + scenario.user( + "This isn't fair. 
I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Overdraft fee dispute test") + + +if __name__ == "__main__": + asyncio.run(test_fraud_investigation_workflow()) diff --git a/tests-demo/test_demo_openai.py b/tests-demo/test_demo_openai.py new file mode 100644 index 0000000..198175d --- /dev/null +++ b/tests-demo/test_demo_openai.py @@ -0,0 +1,453 @@ +""" +Tests for the main bank customer support agent - OpenAI Model + +These tests cover real business scenarios and validate tool calling behavior +using OpenAI claude-sonnet-4-5-mini model for evaluation. +""" +import asyncio +import pytest +import json +import sys +import os +import dotenv + +# Add parent directory to path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import scenario +from main_support_agent_openai import support_agent + +dotenv.load_dotenv() +scenario.configure(default_model="openai/gpt-4o") + + +def _parse_tool_arguments(tool_call: dict) -> dict: + raw_args = tool_call["function"].get("arguments", {}) + if isinstance(raw_args, dict): + return raw_args + if isinstance(raw_args, str): + try: + return json.loads(raw_args) + except json.JSONDecodeError: + return {} + return {} + + +def _assert_success(result: scenario.ScenarioResult, test_name: str) -> None: + assert result.success, f"{test_name} failed: {result.reasoning or 'No failure reasoning returned'}" + + +def _build_tool_trace_messages(response) -> list[dict]: + messages: list[dict] = [] + for i, tool in enumerate(response.tools or []): + tool_call_id = tool.tool_call_id or f"tool_call_{i}" + tool_name = tool.tool_name or "unknown_tool" + tool_args = tool.tool_args if isinstance(tool.tool_args, dict) else {} + tool_result = tool.result if isinstance(tool.result, str) else json.dumps(tool.result or {}) + + messages.append( + { + "role": "assistant", + "content": "", + 
"tool_calls": [ + { + "id": tool_call_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": json.dumps(tool_args), + }, + } + ], + } + ) + messages.append( + { + "role": "tool", + "tool_call_id": tool_call_id, + "content": tool_result, + } + ) + + if isinstance(response.content, str) and response.content: + messages.append({"role": "assistant", "content": response.content}) + + return messages + + +class BankSupportAgentAdapter(scenario.AgentAdapter): + """Adapter for our main bank support agent""" + + async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes: + message_content = input.last_new_user_message_str() + response = support_agent.run(message_content) + + # Convert Agno messages to OpenAI format for Scenario + openai_messages = [] + for message in response.messages or []: + if message.role in ["assistant", "user", "system", "tool"]: + msg_dict = {"role": message.role, "content": message.content} + + # Add tool calls if present (for assistant messages) + if message.tool_calls: + msg_dict["tool_calls"] = message.tool_calls + + # Add tool call ID if present (for tool messages) + if hasattr(message, "tool_call_id") and message.tool_call_id: + msg_dict["tool_call_id"] = message.tool_call_id + + openai_messages.append(msg_dict) + + # Return all messages except system and user (Scenario manages the conversation flow) + # We need to include tool messages to satisfy OpenAI's requirements + relevant_messages = [ + msg for msg in openai_messages if msg["role"] in ["assistant", "tool"] + ] + + if relevant_messages: + has_tool_calls = any( + msg["role"] == "assistant" and "tool_calls" in msg for msg in relevant_messages + ) + if has_tool_calls: + return relevant_messages + + synthetic_messages = _build_tool_trace_messages(response) + if synthetic_messages: + return synthetic_messages + + # Fallback to content if no relevant messages found + return response.content # type: ignore + + +@pytest.mark.agent_test 
+@pytest.mark.asyncio +async def test_fraud_investigation_workflow(): + result = await scenario.run( + name="fraud investigation and card security - OpenAI", + description=""" + Customer discovers unauthorized transactions on their account and is worried about fraud. + They need immediate help to secure their account and investigate the suspicious activity. + This tests whether the agent responds with appropriate urgency and offers concrete security actions. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent takes fraud concerns seriously and responds with urgency", + "Agent gathers necessary information (account details) to investigate", + "Agent offers concrete security actions like card freezing or blocking", + "Agent provides clear next steps for fraud investigation and dispute process", + "Agent maintains professional and reassuring tone throughout", + "Agent does not re-ask for customer ID that was already provided", + ] + ), + ], + script=[ + scenario.user( + "Hi, I just checked my account and there are transactions I didn't make. I think my card was stolen!" + ), + scenario.agent(), + scenario.user( + "My customer ID is CUST_001. There's an $85 charge at Amazon and a $45 charge at some gas station yesterday. I definitely didn't make these purchases." + ), + scenario.agent(), + scenario.user( + "Yes, please help me secure my account right away. I'm really worried about more charges appearing." 
+ ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Fraud investigation test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_escalation_workflow(): + def check_escalation_called(state: scenario.ScenarioState): + """Verify agent escalates when customer explicitly demands human help""" + assert state.has_tool_call( + "escalate_to_human" + ), "Agent should escalate when customer demands manager/human help" + + tool_call = state.last_tool_call("escalate_to_human") + if tool_call: + args = _parse_tool_arguments(tool_call) + reason = args.get("reason", "").lower() + assert any( + keyword in reason + for keyword in ["frustrated", "manager", "human", "escalation"] + ), "Escalation reason should reflect customer's frustration and demand" + + result = await scenario.run( + name="customer escalation to human agent - OpenAI", + description=""" + Customer has been dealing with an ongoing issue and is frustrated. + They explicitly demand to speak with a human agent or manager. + The agent should handle this professionally and escalate appropriately. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent acknowledges customer's frustration empathetically", + "Agent offers to escalate when requested", + "Agent provides escalation timeline and process information", + "Agent maintains professionalism despite customer frustration", + ] + ), + ], + script=[ + scenario.user( + "I've been calling about this same issue for two weeks and nobody can fix it. I want to speak to a real person who can actually help me!" + ), + scenario.agent(), + scenario.user( + "No more troubleshooting! I want a manager or supervisor right now. This is unacceptable service." 
+ ), + scenario.agent(), + check_escalation_called, + scenario.judge(), + ], + ) + + _assert_success(result, "Escalation test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_complex_issue_triggers_knowledge_base(): + result = await scenario.run( + name="complex multi-issue banking problem - OpenAI", + description=""" + Customer has multiple interconnected banking problems: locked online banking, + unexpected fees, and missing direct deposit. They need systematic help. + This tests whether the agent can handle multiple issues comprehensively. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent acknowledges ALL three issues (locked banking, fee, missing deposit)", + "Agent provides systematic approach with clear steps for each issue", + "Agent shows empathy for customer's stress and urgency", + "Agent prioritizes the most urgent issue (locked account for bill payments)", + "Agent offers concrete next steps that the customer can act on", + ] + ), + ], + script=[ + scenario.user( + "I have multiple problems with my account. My online banking is locked, there's a $35 fee I don't understand, and my paycheck didn't deposit. My customer ID is CUST_001." + ), + scenario.agent(), + scenario.user( + "I've tried resetting my password multiple times and I really need access to pay my bills today. This is really stressing me out." + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Complex issue test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_urgent_business_scenario(): + result = await scenario.run( + name="urgent business account problem - OpenAI", + description=""" + Business customer has an urgent issue affecting their operations. + They can't access funds to pay employees. This tests whether the agent + recognizes urgency and takes appropriate high-priority action. 
+ """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent immediately recognizes the business urgency and employee impact", + "Agent responds with high priority and urgency in tone", + "Agent takes concrete action (investigating the freeze or escalating to specialists)", + "Agent provides realistic timeline or sets expectations appropriately", + "Agent offers interim solutions or workarounds if available", + ] + ), + ], + script=[ + scenario.user( + "URGENT: My business account is frozen and I need to pay my employees today. This is costing me money every minute! My business account number is CUST_001." + ), + scenario.agent(), + scenario.user( + "I can't wait. My payroll is due in 2 hours and my employees are depending on me. What can you do right now?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Urgent business test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_simple_inquiry_no_tools(): + result = await scenario.run( + name="simple inquiry without tool usage - OpenAI", + description=""" + Customer asks a simple question about branch hours or general banking info. + The agent should answer directly without invoking any tools. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent answers the simple question directly and helpfully", + "Agent does not over-complicate the response", + "Agent maintains a friendly and professional tone", + ] + ), + ], + script=[ + scenario.user( + "What are your customer support hours? I just want to know when I can call if I have an issue." 
+ ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Simple inquiry test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_spending_analysis_request(): + result = await scenario.run( + name="spending analysis and budgeting help - OpenAI", + description=""" + Customer wants to understand their spending patterns and get budgeting advice. + The agent should use explore_customer_account to analyze their transactions. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent uses account exploration tools to analyze spending", + "Agent provides specific insights about spending categories", + "Agent offers actionable budgeting advice or recommendations", + "Agent is helpful and non-judgmental about spending habits", + ] + ), + ], + script=[ + scenario.user( + "My customer ID is CUST_001. I feel like I'm spending too much lately. Can you help me understand where my money is going?" + ), + scenario.agent(), + scenario.user( + "That's really helpful. Are there any areas where you think I could cut back?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Spending analysis test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_lost_card_replacement(): + result = await scenario.run( + name="lost card replacement workflow - OpenAI", + description=""" + Customer has lost their debit card and needs a replacement. + Tests whether the agent handles the card replacement process properly + including immediate security measures. 
+ """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent treats lost card with appropriate urgency", + "Agent suggests freezing or blocking the lost card immediately", + "Agent explains the replacement card process and timeline", + "Agent asks about any unauthorized transactions since the card was lost", + "Agent reassures the customer about account security", + ] + ), + ], + script=[ + scenario.user( + "I lost my debit card somewhere yesterday. I've looked everywhere and can't find it. My customer ID is CUST_001." + ), + scenario.agent(), + scenario.user( + "I don't think anyone has used it, but I'm not sure. Can you check and get me a new card?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Lost card replacement test") + + +@pytest.mark.agent_test +@pytest.mark.asyncio +async def test_overdraft_fee_dispute(): + result = await scenario.run( + name="overdraft fee dispute and resolution - OpenAI", + description=""" + Customer with a basic checking account notices an overdraft fee and wants + it reversed. Tests empathy, account investigation, and fee resolution. + """, + agents=[ + BankSupportAgentAdapter(), + scenario.UserSimulatorAgent(model="openai/gpt-4o"), + scenario.JudgeAgent( + model="openai/gpt-4o", + criteria=[ + "Agent shows empathy for the customer's frustration about the fee", + "Agent investigates the account to understand the overdraft situation", + "Agent explains how the overdraft fee occurred", + "Agent offers a resolution path (fee waiver, escalation, or explanation)", + "Agent suggests ways to avoid future overdraft fees", + ] + ), + ], + script=[ + scenario.user( + "I just saw a $35 overdraft fee on my account and I'm really upset. I had money in there! My customer ID is CUST_002." + ), + scenario.agent(), + scenario.user( + "This isn't fair. 
I've been a customer for 2 years and this is the first time this has happened. Can you waive the fee?" + ), + scenario.agent(), + scenario.judge(), + ], + ) + + _assert_success(result, "Overdraft fee dispute test") + + +if __name__ == "__main__": + asyncio.run(test_fraud_investigation_workflow()) diff --git a/tests/test_main_support_agent.py b/tests/test_main_support_agent.py index 1376f81..d28f4ff 100644 --- a/tests/test_main_support_agent.py +++ b/tests/test_main_support_agent.py @@ -17,7 +17,7 @@ from main_support_agent import support_agent dotenv.load_dotenv() -scenario.configure(default_model="openai/gpt-4o-mini") +scenario.configure(default_model="nebius/openai/gpt-oss-120b") class BankSupportAgentAdapter(scenario.AgentAdapter):